author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
commit     3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree       e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/row
parent     Initial commit. (diff)
Adding upstream version 1:10.11.6. (upstream/1%10.11.6, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/row')
-rw-r--r--  storage/innobase/row/row0ext.cc      132
-rw-r--r--  storage/innobase/row/row0ftsort.cc  1791
-rw-r--r--  storage/innobase/row/row0import.cc  4585
-rw-r--r--  storage/innobase/row/row0ins.cc     3843
-rw-r--r--  storage/innobase/row/row0log.cc     4134
-rw-r--r--  storage/innobase/row/row0merge.cc   5406
-rw-r--r--  storage/innobase/row/row0mysql.cc   2916
-rw-r--r--  storage/innobase/row/row0purge.cc   1304
-rw-r--r--  storage/innobase/row/row0quiesce.cc  715
-rw-r--r--  storage/innobase/row/row0row.cc     1720
-rw-r--r--  storage/innobase/row/row0sel.cc     6947
-rw-r--r--  storage/innobase/row/row0uins.cc     652
-rw-r--r--  storage/innobase/row/row0umod.cc    1288
-rw-r--r--  storage/innobase/row/row0undo.cc     453
-rw-r--r--  storage/innobase/row/row0upd.cc     3002
-rw-r--r--  storage/innobase/row/row0vers.cc    1419
16 files changed, 40307 insertions, 0 deletions
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc
new file mode 100644
index 00000000..b7a62760
--- /dev/null
+++ b/storage/innobase/row/row0ext.cc
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.cc
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+#include "btr0cur.h"
+
+/** Fills the column prefix cache of an externally stored column.
+@param[in,out] ext column prefix cache
+@param[in] i index of ext->ext[]
+@param[in] space tablespace
+@param[in] dfield data field */
+static
+void
+row_ext_cache_fill(
+ row_ext_t* ext,
+ ulint i,
+ fil_space_t* space,
+ const dfield_t* dfield)
+{
+ const byte* field = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * ext->max_len;
+
+ ut_ad(ext->max_len > 0);
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN
+ && f_len > BTR_EXTERN_FIELD_REF_SIZE) {
+			/* In this case, the field is in Barracuda format
+			or beyond (refer to the definition of
+			row_ext_t.max_len) and is already filled with the
+			prefix; otherwise f_len would be
+			BTR_EXTERN_FIELD_REF_SIZE. So there is no need to
+			re-read the prefix externally; just copy the local
+			prefix to buf. Note that ext->len[i] == 0 would
+			mean an error, as above. */
+ memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE);
+ ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ /* Fetch at most ext->max_len of the column.
+ The column should be non-empty. However,
+ trx_rollback_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, ext->max_len, ext->zip_size,
+ field, f_len);
+ }
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dict_table_t& table, /*!< in: table */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ if (!table.space) {
+ return NULL;
+ }
+
+ ut_ad(n_ext > 0);
+
+ row_ext_t* ret = static_cast<row_ext_t*>(
+ mem_heap_alloc(heap,
+ (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(table.flags);
+ ret->zip_size = dict_tf_get_zip_size(table.flags);
+
+ ret->buf = static_cast<byte*>(
+ mem_heap_alloc(heap, n_ext * ret->max_len));
+
+ /* Fetch the BLOB prefixes */
+ for (ulint i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, table.space, dfield);
+ }
+
+ return(ret);
+}
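
A minimal sketch (not part of the patch, helper name hypothetical) of how the
cache built above is read back: row_ext_cache_fill() stores the prefix of the
i-th externally stored column at buf + i * max_len and records its length in
len[i], with 0 meaning "could not be cached"; the real accessors live in
row0ext.h.

    static const byte* example_row_ext_prefix(const row_ext_t* ext,
                                               ulint i, ulint* len)
    {
        ut_ad(i < ext->n_ext);
        *len = ext->len[i];                       /* 0: prefix not cached */
        return *len ? ext->buf + i * ext->max_len : NULL;
    }
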
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
new file mode 100644
index 00000000..17a2f034
--- /dev/null
+++ b/storage/innobase/row/row0ftsort.cc
@@ -0,0 +1,1791 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "row0ftsort.h"
+#include "dict0dict.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+#include "fts0plugin.h"
+#include "log0crypt.h"
+
+/** Read the next record to buffer N.
+@param N index into array of merge info structure */
+#define ROW_MERGE_READ_GET_NEXT(N) \
+ do { \
+ b[N] = row_merge_read_rec( \
+ block[N], buf[N], b[N], index, \
+ fd[N], &foffs[N], &mrec[N], offsets[N], \
+ crypt_block[N], space); \
+ if (UNIV_UNLIKELY(!b[N])) { \
+ if (mrec[N]) { \
+ goto exit; \
+ } \
+ } \
+ } while (0)
+
+/** Parallel sort degree */
+ulong fts_sort_pll_degree = 2;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depending on the number of records to sort, it can be a 4-byte or
+8-byte integer value)
+3) Word's position in original doc.
+
+@see fts_create_one_index_table()
+
+@return dict_index_t structure for the fts sort index */
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ dict_table_t* table, /*!< in,out: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size)
+ /*!< out: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+{
+ dict_index_t* new_index;
+ dict_field_t* field;
+ dict_field_t* idx_field;
+ CHARSET_INFO* charset;
+
+ // FIXME: This name shouldn't be hard coded here.
+ new_index = dict_mem_index_create(table, "tmp_fts_idx", DICT_FTS, 3);
+
+ new_index->id = index->id;
+ new_index->n_uniq = FTS_NUM_FIELDS_SORT;
+ new_index->n_def = FTS_NUM_FIELDS_SORT;
+ new_index->cached = TRUE;
+ new_index->parser = index->parser;
+
+ idx_field = dict_index_get_nth_field(index, 0);
+ charset = fts_index_get_charset(index);
+
+ /* The first field is on the Tokenized Word */
+ field = dict_index_get_nth_field(new_index, 0);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->descending = false;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+ field->col->mtype = charset == &my_charset_latin1
+ ? DATA_VARCHAR : DATA_VARMYSQL;
+ field->col->mbminlen = idx_field->col->mbminlen;
+ field->col->mbmaxlen = idx_field->col->mbmaxlen;
+ field->col->len = static_cast<uint16_t>(
+ HA_FT_MAXCHARLEN * field->col->mbmaxlen);
+
+ field->fixed_len = 0;
+
+ /* Doc ID */
+ field = dict_index_get_nth_field(new_index, 1);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->descending = false;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+ *opt_doc_id_size = FALSE;
+
+	/* Check whether we can use a 4-byte instead of an 8-byte integer
+	field to hold the Doc ID, thus reducing the overall sort size */
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ /* If Doc ID column is being added by this create
+ index, then just check the number of rows in the table */
+ if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ } else {
+ doc_id_t max_doc_id;
+
+ /* If the Doc ID column is supplied by user, then
+ check the maximum Doc ID in the table */
+ max_doc_id = fts_get_max_doc_id((dict_table_t*) table);
+
+ if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ }
+
+ if (*opt_doc_id_size) {
+ field->col->len = sizeof(ib_uint32_t);
+ field->fixed_len = sizeof(ib_uint32_t);
+ } else {
+ field->col->len = FTS_DOC_ID_LEN;
+ field->fixed_len = FTS_DOC_ID_LEN;
+ }
+
+ field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+
+ /* The third field is on the word's position in the original doc */
+ field = dict_index_get_nth_field(new_index, 2);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->descending = false;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_zalloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+ field->col->len = 4 ;
+ field->fixed_len = 4;
+ field->col->prtype = DATA_NOT_NULL;
+
+ return(new_index);
+}
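
To make the Doc ID size optimization above concrete, here is a rough sketch
(illustrative only, helper name hypothetical) of the payload of one sort
record built from the three fields of this temporary index: the tokenized
word, the Doc ID (4 or 8 bytes), and the 4-byte position.

    static inline ulint example_fts_sort_rec_bytes(ulint word_len,
                                                    bool small_doc_id)
    {
        return word_len
            + (small_doc_id ? sizeof(ib_uint32_t) : FTS_DOC_ID_LEN)
            + 4;  /* word position is always stored in 4 bytes */
    }
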
+
+/** Initialize FTS parallel sort structures.
+@param[in] trx transaction
+@param[in,out] dup descriptor of FTS index being created
+@param[in,out] new_table table where indexes are created
+@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes
+ integer to store Doc ID during sort
+@param[in] old_zip_size page size of the old table during alter
+@param[out] psort parallel sort info to be instantiated
+@param[out] merge parallel merge info to be instantiated
+@return true if all successful */
+bool
+row_fts_psort_info_init(
+ trx_t* trx,
+ row_merge_dup_t*dup,
+ dict_table_t* new_table,
+ bool opt_doc_id_size,
+ ulint old_zip_size,
+ fts_psort_t** psort,
+ fts_psort_t** merge)
+{
+ ulint i;
+ ulint j;
+ fts_psort_common_t* common_info = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ ulint block_size;
+ ibool ret = TRUE;
+ ut_ad(ut_is_2pow(old_zip_size));
+
+ block_size = 3 * srv_sort_buf_size;
+
+ *psort = psort_info = static_cast<fts_psort_t*>(ut_zalloc_nokey(
+ fts_sort_pll_degree * sizeof *psort_info));
+
+ if (!psort_info) {
+ ut_free(dup);
+ return(FALSE);
+ }
+
+ /* Common Info for all sort threads */
+ common_info = static_cast<fts_psort_common_t*>(
+ ut_malloc_nokey(sizeof *common_info));
+
+ if (!common_info) {
+ ut_free(dup);
+ ut_free(psort_info);
+ return(FALSE);
+ }
+
+ common_info->dup = dup;
+ common_info->new_table = new_table;
+ common_info->old_zip_size = old_zip_size;
+ common_info->trx = trx;
+ common_info->all_info = psort_info;
+ pthread_cond_init(&common_info->sort_cond, nullptr);
+ common_info->opt_doc_id_size = opt_doc_id_size;
+
+ ut_ad(trx->mysql_thd != NULL);
+ const char* path = thd_innodb_tmpdir(trx->mysql_thd);
+ /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for
+ each parallel sort thread. Each "sort bucket" holds records for
+ a particular "FTS index partition" */
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+
+ UT_LIST_INIT(
+ psort_info[j].fts_doc_list, &fts_doc_item_t::doc_list);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+ psort_info[j].merge_file[i] =
+ static_cast<merge_file_t*>(
+ ut_zalloc_nokey(sizeof(merge_file_t)));
+
+ if (!psort_info[j].merge_file[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ psort_info[j].merge_buf[i] = row_merge_buf_create(
+ dup->index);
+
+ if (row_merge_file_create(psort_info[j].merge_file[i],
+ path) == OS_FILE_CLOSED) {
+ goto func_exit;
+ }
+
+ /* Need to align memory for O_DIRECT write */
+ psort_info[j].merge_block[i] =
+ static_cast<row_merge_block_t*>(
+ aligned_malloc(block_size, 1024));
+
+ if (!psort_info[j].merge_block[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* If tablespace is encrypted, allocate additional buffer for
+ encryption/decryption. */
+ if (srv_encrypt_log) {
+ /* Need to align memory for O_DIRECT write */
+ psort_info[j].crypt_block[i] =
+ static_cast<row_merge_block_t*>(
+ aligned_malloc(block_size,
+ 1024));
+
+ if (!psort_info[j].crypt_block[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+ } else {
+ psort_info[j].crypt_block[i] = NULL;
+ }
+ }
+
+ psort_info[j].child_status = 0;
+ psort_info[j].state = 0;
+ psort_info[j].psort_common = common_info;
+ psort_info[j].error = DB_SUCCESS;
+ psort_info[j].memory_used = 0;
+ mysql_mutex_init(0, &psort_info[j].mutex, nullptr);
+ }
+
+	/* Initialize merge_info structures for the parallel merge and
+	insert into the auxiliary FTS tables (FTS_INDEX_TABLE) */
+ *merge = merge_info = static_cast<fts_psort_t*>(
+ ut_malloc_nokey(FTS_NUM_AUX_INDEX * sizeof *merge_info));
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+
+ merge_info[j].child_status = 0;
+ merge_info[j].state = 0;
+ merge_info[j].psort_common = common_info;
+ }
+
+func_exit:
+ if (!ret) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ }
+
+ return(ret);
+}
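
A rough estimate (sketch only, helper name hypothetical) of the aligned block
memory reserved per sort thread by the initialization above: each of the
FTS_NUM_AUX_INDEX sort buckets gets one merge block of 3 * srv_sort_buf_size
bytes, doubled when srv_encrypt_log requires a crypt block as well.

    static inline ulint example_psort_block_bytes(bool encrypted)
    {
        const ulint one_block = 3 * srv_sort_buf_size;
        return FTS_NUM_AUX_INDEX * one_block * (encrypted ? 2 : 1);
    }
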
+/*********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+merge sort files */
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info) /*!< parallel merge info */
+{
+ ulint i;
+ ulint j;
+
+ if (psort_info) {
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (psort_info[j].merge_file[i]) {
+ row_merge_file_destroy(
+ psort_info[j].merge_file[i]);
+ }
+
+ aligned_free(psort_info[j].merge_block[i]);
+ ut_free(psort_info[j].merge_file[i]);
+ aligned_free(psort_info[j].crypt_block[i]);
+ }
+
+ mysql_mutex_destroy(&psort_info[j].mutex);
+ }
+
+ pthread_cond_destroy(&merge_info[0].psort_common->sort_cond);
+ ut_free(merge_info[0].psort_common->dup);
+ ut_free(merge_info[0].psort_common);
+ ut_free(psort_info);
+ }
+
+ ut_free(merge_info);
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info) /*!< in: parallel sort info */
+{
+ ulint j;
+ ulint i;
+
+ if (!psort_info) {
+ return;
+ }
+
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ row_merge_buf_free(psort_info[j].merge_buf[i]);
+ }
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for row merge.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+static
+int
+row_merge_fts_doc_add_word_for_parser(
+/*==================================*/
+	MYSQL_FTPARSER_PARAM *param,		/* in: parser parameter */
+ const char *word, /* in: token word */
+ int word_len, /* in: word len */
+ MYSQL_FTPARSER_BOOLEAN_INFO* boolean_info) /* in: boolean info */
+{
+ fts_string_t str;
+ fts_tokenize_ctx_t* t_ctx;
+ row_fts_token_t* fts_token;
+ byte* ptr;
+
+ ut_ad(param);
+ ut_ad(param->mysql_ftparam);
+ ut_ad(word);
+ ut_ad(boolean_info);
+
+ t_ctx = static_cast<fts_tokenize_ctx_t*>(param->mysql_ftparam);
+ ut_ad(t_ctx);
+
+ str.f_str = (byte*)(word);
+ str.f_len = ulint(word_len);
+ str.f_n_char = fts_get_token_size(
+ (CHARSET_INFO*)param->cs, word, ulint(word_len));
+
+ /* JAN: TODO: MySQL 5.7 FTS
+ ut_ad(boolean_info->position >= 0);
+ */
+
+ ptr = static_cast<byte*>(ut_malloc_nokey(sizeof(row_fts_token_t)
+ + sizeof(fts_string_t) + str.f_len));
+ fts_token = reinterpret_cast<row_fts_token_t*>(ptr);
+ fts_token->text = reinterpret_cast<fts_string_t*>(
+ ptr + sizeof(row_fts_token_t));
+ fts_token->text->f_str = static_cast<byte*>(
+ ptr + sizeof(row_fts_token_t) + sizeof(fts_string_t));
+
+ fts_token->text->f_len = str.f_len;
+ fts_token->text->f_n_char = str.f_n_char;
+ memcpy(fts_token->text->f_str, str.f_str, str.f_len);
+
+ /* JAN: TODO: MySQL 5.7 FTS
+ fts_token->position = boolean_info->position;
+ */
+
+ /* Add token to list */
+ UT_LIST_ADD_LAST(t_ctx->fts_token_list, fts_token);
+
+ return(0);
+}
+
+/*********************************************************************//**
+Tokenize by fts plugin parser */
+static
+void
+row_merge_fts_doc_tokenize_by_parser(
+/*=================================*/
+ fts_doc_t* doc, /* in: doc to tokenize */
+ st_mysql_ftparser* parser, /* in: plugin parser instance */
+ fts_tokenize_ctx_t* t_ctx) /* in/out: tokenize ctx instance */
+{
+ MYSQL_FTPARSER_PARAM param;
+
+ ut_a(parser);
+
+	/* Set parameters for param */
+ param.mysql_parse = fts_tokenize_document_internal;
+ param.mysql_add_word = row_merge_fts_doc_add_word_for_parser;
+ param.mysql_ftparam = t_ctx;
+ param.cs = doc->charset;
+ param.doc = reinterpret_cast<char*>(doc->text.f_str);
+ param.length = static_cast<int>(doc->text.f_len);
+ param.mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+ PARSER_INIT(parser, &param);
+ /* We assume parse returns successfully here. */
+ parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@see row_merge_buf_encode()
+@return TRUE if the record passed, FALSE if out of space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+ row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */
+ doc_id_t doc_id, /*!< in: Doc ID */
+ fts_doc_t* doc, /*!< in: Doc to be tokenized */
+ merge_file_t** merge_file, /*!< in/out: merge file */
+ ibool opt_doc_id_size,/*!< in: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort*/
+ fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */
+{
+ ulint inc = 0;
+ fts_string_t str;
+ ulint len;
+ row_merge_buf_t* buf;
+ dfield_t* field;
+ fts_string_t t_str;
+ ibool buf_full = FALSE;
+ byte str_buf[FTS_MAX_WORD_LEN + 1];
+ ulint data_size[FTS_NUM_AUX_INDEX];
+ ulint n_tuple[FTS_NUM_AUX_INDEX];
+ st_mysql_ftparser* parser;
+
+ t_str.f_n_char = 0;
+ t_ctx->buf_used = 0;
+
+ memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+ memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ parser = sort_buf[0]->index->parser;
+
+ /* Tokenize the data and add each word string, its corresponding
+ doc id and position to sort buffer */
+ while (parser
+ ? (!t_ctx->processed_len
+ || UT_LIST_GET_LEN(t_ctx->fts_token_list))
+ : t_ctx->processed_len < doc->text.f_len) {
+ ulint idx = 0;
+ ulint cur_len;
+ doc_id_t write_doc_id;
+ row_fts_token_t* fts_token = NULL;
+
+ if (parser != NULL) {
+ if (t_ctx->processed_len == 0) {
+ UT_LIST_INIT(t_ctx->fts_token_list, &row_fts_token_t::token_list);
+
+ /* Parse the whole doc and cache tokens */
+ row_merge_fts_doc_tokenize_by_parser(doc,
+ parser, t_ctx);
+
+				/* Just indicate that we have parsed all the words */
+ t_ctx->processed_len += 1;
+ }
+
+ /* Then get a token */
+ fts_token = UT_LIST_GET_FIRST(t_ctx->fts_token_list);
+ if (fts_token) {
+ str.f_len = fts_token->text->f_len;
+ str.f_n_char = fts_token->text->f_n_char;
+ str.f_str = fts_token->text->f_str;
+ } else {
+ ut_ad(UT_LIST_GET_LEN(t_ctx->fts_token_list) == 0);
+ /* Reach the end of the list */
+ t_ctx->processed_len = doc->text.f_len;
+ break;
+ }
+ } else {
+ inc = innobase_mysql_fts_get_token(
+ doc->charset,
+ doc->text.f_str + t_ctx->processed_len,
+ doc->text.f_str + doc->text.f_len, &str);
+
+ ut_a(inc > 0);
+ }
+
+		/* Ignore strings whose number of characters is less than
+		"fts_min_token_size" or more than "fts_max_token_size" */
+ if (!fts_check_token(&str, NULL, NULL)) {
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+
+ continue;
+ }
+
+ t_str.f_len = innobase_fts_casedn_str(
+ doc->charset, (char*) str.f_str, str.f_len,
+ (char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+ t_str.f_str = (byte*) &str_buf;
+
+ /* if "cached_stopword" is defined, ignore words in the
+ stopword list */
+ if (!fts_check_token(&str, t_ctx->cached_stopword,
+ doc->charset)) {
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+
+ continue;
+ }
+
+ /* There are FTS_NUM_AUX_INDEX auxiliary tables, find
+ out which sort buffer to put this word record in */
+ t_ctx->buf_used = fts_select_index(
+ doc->charset, t_str.f_str, t_str.f_len);
+
+ buf = sort_buf[t_ctx->buf_used];
+
+ ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+ idx = t_ctx->buf_used;
+
+ mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]];
+
+ field = mtuple->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap,
+ FTS_NUM_FIELDS_SORT * sizeof *field));
+
+ /* The first field is the tokenized word */
+ dfield_set_data(field, t_str.f_str, t_str.f_len);
+ len = dfield_get_len(field);
+
+ dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type);
+ field->type.prtype |= DATA_NOT_NULL;
+ ut_ad(len <= field->type.len);
+
+ /* For the temporary file, row_merge_buf_encode() uses
+ 1 byte for representing the number of extra_size bytes.
+ This number will always be 1, because for this 3-field index
+ consisting of one variable-size column, extra_size will always
+ be 1 or 2, which can be encoded in one byte.
+
+ The extra_size is 1 byte if the length of the
+ variable-length column is less than 128 bytes or the
+ maximum length is less than 256 bytes. */
+
+		/* There is one variable-length column: the word, whose
+		length is bounded by fts_max_token_size. Add the extra_size
+		byte(s) plus one byte that encodes how many extra_size bytes
+		there are.
+
+		Since the maximum FTS token length now exceeds 255, the
+		length byte count matters: lengths of 1 to 128 bytes fit in
+		1 byte; anything longer needs 2 bytes. */
+ if (len < 128 || field->type.len < 256) {
+ /* Extra size is one byte. */
+ cur_len = 2 + len;
+ } else {
+ /* Extra size is two bytes. */
+ cur_len = 3 + len;
+ }
+
+ dfield_dup(field, buf->heap);
+ field++;
+
+ /* The second field is the Doc ID */
+
+ ib_uint32_t doc_id_32_bit;
+
+ if (!opt_doc_id_size) {
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+ } else {
+ mach_write_to_4(
+ (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+ dfield_set_data(
+ field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+ }
+
+ len = field->len;
+ ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+ field->type.len = static_cast<uint16_t>(field->len);
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ ++field;
+
+ /* The third field is the position.
+ MySQL 5.7 changed the fulltext parser plugin interface
+ by adding MYSQL_FTPARSER_BOOLEAN_INFO::position.
+ Below we assume that the field is always 0. */
+ ulint pos = t_ctx->init_pos;
+ byte position[4];
+ if (parser == NULL) {
+ pos += t_ctx->processed_len + inc - str.f_len;
+ }
+ len = 4;
+ mach_write_to_4(position, pos);
+ dfield_set_data(field, &position, len);
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL;
+ field->type.len = 4;
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ /* Reserve one byte for the end marker of row_merge_block_t */
+ if (buf->total_size + data_size[idx] + cur_len
+ >= srv_sort_buf_size - 1) {
+
+ buf_full = TRUE;
+ break;
+ }
+
+ /* Increment the number of tuples */
+ n_tuple[idx]++;
+ if (parser != NULL) {
+ UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
+ ut_free(fts_token);
+ } else {
+ t_ctx->processed_len += inc;
+ }
+ data_size[idx] += cur_len;
+ }
+
+ /* Update the data length and the number of new word tuples
+ added in this round of tokenization */
+ for (ulint i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ /* The computation of total_size below assumes that no
+ delete-mark flags will be stored and that all fields
+ are NOT NULL and fixed-length. */
+
+ sort_buf[i]->total_size += data_size[i];
+
+ sort_buf[i]->n_tuples += n_tuple[i];
+
+ merge_file[i]->n_rec += n_tuple[i];
+ t_ctx->rows_added[i] += n_tuple[i];
+ }
+
+ if (!buf_full) {
+		/* we pad one byte between text across two fields */
+ t_ctx->init_pos += doc->text.f_len + 1;
+ }
+
+ return(!buf_full);
+}
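
The cur_len accounting in row_merge_fts_doc_tokenize() can be summarized by
the following sketch (illustrative only, helper name hypothetical): the merge
record charges 2 header bytes when the word is shorter than 128 bytes (or the
column maximum is below 256 bytes) and 3 header bytes otherwise, plus the Doc
ID and the 4-byte position.

    static inline ulint example_token_charge(ulint word_len, ulint col_max_len,
                                              ulint doc_id_len /* 4 or 8 */)
    {
        const ulint header = (word_len < 128 || col_max_len < 256) ? 2 : 3;
        return header + word_len + doc_id_len + 4;  /* 4 = position field */
    }
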
+
+/*********************************************************************//**
+Get next doc item from fts_doc_list */
+UNIV_INLINE
+void
+row_merge_fts_get_next_doc_item(
+/*============================*/
+ fts_psort_t* psort_info, /*!< in: psort_info */
+ fts_doc_item_t** doc_item) /*!< in/out: doc item */
+{
+ if (*doc_item != NULL) {
+ ut_free(*doc_item);
+ }
+
+ mysql_mutex_lock(&psort_info->mutex);
+
+ *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+ if (*doc_item != NULL) {
+ UT_LIST_REMOVE(psort_info->fts_doc_list, *doc_item);
+
+ ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len);
+ psort_info->memory_used -= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len;
+ }
+
+ mysql_mutex_unlock(&psort_info->mutex);
+}
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+It also performs the initial in-memory sort of the parsed records.
+*/
+static
+void fts_parallel_tokenization(
+/*======================*/
+ void* arg) /*!< in: psort_info for the thread */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint i;
+ fts_doc_item_t* doc_item = NULL;
+ row_merge_buf_t** buf;
+ ibool processed = FALSE;
+ merge_file_t** merge_file;
+ row_merge_block_t** block;
+ row_merge_block_t** crypt_block;
+ pfs_os_file_t tmpfd[FTS_NUM_AUX_INDEX];
+ ulint mycount[FTS_NUM_AUX_INDEX];
+ ulint num_doc_processed = 0;
+ doc_id_t last_doc_id = 0;
+ mem_heap_t* blob_heap = NULL;
+ fts_doc_t doc;
+ dict_table_t* table = psort_info->psort_common->new_table;
+ fts_tokenize_ctx_t t_ctx;
+ ulint retried = 0;
+ dberr_t error = DB_SUCCESS;
+
+	ut_ad(psort_info->psort_common->trx->mysql_thd != NULL);
+
+	const char* path = thd_innodb_tmpdir(
+		psort_info->psort_common->trx->mysql_thd);
+
+ ut_ad(psort_info);
+
+ buf = psort_info->merge_buf;
+ merge_file = psort_info->merge_file;
+ blob_heap = mem_heap_create(512);
+ memset(&doc, 0, sizeof(doc));
+	memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ doc.charset = fts_index_get_charset(
+ psort_info->psort_common->dup->index);
+
+ block = psort_info->merge_block;
+ crypt_block = psort_info->crypt_block;
+
+ const ulint zip_size = psort_info->psort_common->old_zip_size;
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword;
+ processed = TRUE;
+loop:
+ while (doc_item) {
+ dfield_t* dfield = doc_item->field;
+
+ last_doc_id = doc_item->doc_id;
+
+ ut_ad (dfield->data != NULL
+ && dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+		/* If we have finished processing the last item, update
+		"doc" with the strings in doc_item; otherwise continue
+		processing the previous item */
+ if (processed) {
+ byte* data;
+ ulint data_len;
+
+ dfield = doc_item->field;
+ data = static_cast<byte*>(dfield_get_data(dfield));
+ data_len = dfield_get_len(dfield);
+
+ if (dfield_is_ext(dfield)) {
+ doc.text.f_str =
+ btr_copy_externally_stored_field(
+ &doc.text.f_len, data,
+ zip_size, data_len, blob_heap);
+ } else {
+ doc.text.f_str = data;
+ doc.text.f_len = data_len;
+ }
+
+ doc.tokens = 0;
+ t_ctx.processed_len = 0;
+ } else {
+			/* Not yet finished processing the "doc" on hand,
+			continue processing it */
+ ut_ad(doc.text.f_str);
+ ut_ad(buf[0]->index->parser
+ || t_ctx.processed_len < doc.text.f_len);
+ }
+
+ processed = row_merge_fts_doc_tokenize(
+ buf, doc_item->doc_id, &doc,
+ merge_file, psort_info->psort_common->opt_doc_id_size,
+ &t_ctx);
+
+ /* Current sort buffer full, need to recycle */
+ if (!processed) {
+ ut_ad(buf[0]->index->parser
+ || t_ctx.processed_len < doc.text.f_len);
+ ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+ break;
+ }
+
+ num_doc_processed++;
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)
+ && num_doc_processed % 10000 == 1) {
+ ib::info() << "Number of documents processed: "
+ << num_doc_processed;
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ ib::info() << "ID " << psort_info->psort_id
+ << ", partition " << i << ", word "
+ << mycount[i];
+ }
+#endif
+ }
+
+ mem_heap_empty(blob_heap);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item && last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+ }
+
+ /* If we run out of current sort buffer, need to sort
+ and flush the sort buffer to disk */
+ if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+ row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+ row_merge_buf_write(buf[t_ctx.buf_used],
+#ifndef DBUG_OFF
+ merge_file[t_ctx.buf_used],
+#endif
+ block[t_ctx.buf_used]);
+
+ if (!row_merge_write(merge_file[t_ctx.buf_used]->fd,
+ merge_file[t_ctx.buf_used]->offset++,
+ block[t_ctx.buf_used],
+ crypt_block[t_ctx.buf_used],
+ table->space_id)) {
+ error = DB_TEMP_FILE_WRITE_FAIL;
+ goto func_exit;
+ }
+
+ MEM_UNDEFINED(block[t_ctx.buf_used], srv_sort_buf_size);
+ buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+ mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+ t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+ ut_a(doc_item);
+ goto loop;
+ }
+
+ /* Parent done scanning, and if finish processing all the docs, exit */
+ if (psort_info->state == FTS_PARENT_COMPLETE) {
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) {
+ goto exit;
+ } else if (retried > 10000) {
+ ut_ad(!doc_item);
+ /* retried too many times and cannot get new record */
+ ib::error() << "FTS parallel sort processed "
+ << num_doc_processed
+ << " records, the sort queue has "
+ << UT_LIST_GET_LEN(psort_info->fts_doc_list)
+ << " records. But sort cannot get the next"
+ " records during alter table " << table->name;
+ goto exit;
+ }
+ } else if (psort_info->state == FTS_PARENT_EXITING) {
+ /* Parent abort */
+ goto func_exit;
+ }
+
+ if (doc_item == NULL) {
+ std::this_thread::yield();
+ }
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item != NULL) {
+ if (last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+
+ retried = 0;
+ } else if (psort_info->state == FTS_PARENT_COMPLETE) {
+ retried++;
+ }
+
+ goto loop;
+
+exit:
+	/* Do a final sort of the last (or latest) batch of records
+	in block memory. Flush them to the temp file if the records
+	cannot be held in one memory block */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (t_ctx.rows_added[i]) {
+ row_merge_buf_sort(buf[i], NULL);
+ row_merge_buf_write(buf[i],
+#ifndef DBUG_OFF
+ merge_file[i],
+#endif
+ block[i]);
+
+ /* Write to temp file, only if records have
+ been flushed to temp file before (offset > 0):
+			The pseudo code for the sort is as follows:
+
+ while (there are rows) {
+ tokenize rows, put result in block[]
+ if (block[] runs out) {
+ sort rows;
+ write to temp file with
+ row_merge_write();
+ offset++;
+ }
+ }
+
+ # write out the last batch
+ if (offset > 0) {
+ row_merge_write();
+ offset++;
+ } else {
+ # no need to write anything
+ offset stay as 0
+ }
+
+			so if merge_file[i]->offset is 0 when we come
+			here for the last batch, the rows have never been
+			flushed to the temp file and can all be held in
+			memory */
+ if (merge_file[i]->offset != 0) {
+ if (!row_merge_write(merge_file[i]->fd,
+ merge_file[i]->offset++,
+ block[i],
+ crypt_block[i],
+ table->space_id)) {
+ error = DB_TEMP_FILE_WRITE_FAIL;
+ goto func_exit;
+ }
+
+#ifdef HAVE_valgrind
+ MEM_UNDEFINED(block[i], srv_sort_buf_size);
+
+ if (crypt_block[i]) {
+ MEM_UNDEFINED(crypt_block[i],
+ srv_sort_buf_size);
+ }
+#endif /* HAVE_valgrind */
+ }
+
+ buf[i] = row_merge_buf_empty(buf[i]);
+ t_ctx.rows_added[i] = 0;
+ }
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n");
+ }
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (!merge_file[i]->offset) {
+ continue;
+ }
+
+ tmpfd[i] = row_merge_file_create_low(path);
+ if (tmpfd[i] == OS_FILE_CLOSED) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ error = row_merge_sort(psort_info->psort_common->trx,
+ psort_info->psort_common->dup,
+ merge_file[i], block[i], &tmpfd[i],
+ false, 0.0/* pct_progress */, 0.0/* pct_cost */,
+ crypt_block[i], table->space_id);
+
+ if (error != DB_SUCCESS) {
+ row_merge_file_destroy_low(tmpfd[i]);
+ goto func_exit;
+ }
+
+ row_merge_file_destroy_low(tmpfd[i]);
+ }
+
+func_exit:
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n");
+ }
+
+ mem_heap_free(blob_heap);
+
+ mysql_mutex_lock(&psort_info->mutex);
+ psort_info->error = error;
+ mysql_mutex_unlock(&psort_info->mutex);
+
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) {
+ /* child can exit either with error or told by parent. */
+ ut_ad(error != DB_SUCCESS
+ || psort_info->state == FTS_PARENT_EXITING);
+ }
+
+ /* Free fts doc list in case of error. */
+ do {
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+ } while (doc_item != NULL);
+
+ mysql_mutex_lock(&psort_info->mutex);
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ pthread_cond_signal(&psort_info->psort_common->sort_cond);
+ mysql_mutex_unlock(&psort_info->mutex);
+}
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info) /*!< parallel sort structure */
+{
+ ulint i = 0;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ psort_info[i].psort_id = i;
+ psort_info[i].task =
+ new tpool::waitable_task(fts_parallel_tokenization,&psort_info[i]);
+ srv_thread_pool->submit_task(psort_info[i].task);
+ }
+}
+
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records. */
+static
+void
+fts_parallel_merge(
+/*===============*/
+ void* arg) /*!< in: parallel merge info */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint id;
+
+ ut_ad(psort_info);
+
+ id = psort_info->psort_id;
+
+ row_fts_merge_insert(psort_info->psort_common->dup->index,
+ psort_info->psort_common->new_table,
+ psort_info->psort_common->all_info, id);
+}
+
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info) /*!< in: parallel sort info */
+{
+ ulint i = 0;
+
+ /* Kick off merge/insert tasks */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ merge_info[i].psort_id = i;
+ merge_info[i].child_status = 0;
+
+ merge_info[i].task = new tpool::waitable_task(
+ fts_parallel_merge,
+ (void*) &merge_info[i]);
+ srv_thread_pool->submit_task(merge_info[i].task);
+ }
+}
+
+/**
+Write out a single word's data as new entry/entries in the INDEX table.
+@param[in] ins_ctx insert context
+@param[in] word word string
+@param[in]	node	node columns
+@return DB_SUCCESS if insertion runs fine, otherwise error code */
+static
+dberr_t
+row_merge_write_fts_node(
+ const fts_psort_insert_t* ins_ctx,
+ const fts_string_t* word,
+ const fts_node_t* node)
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ dberr_t ret = DB_SUCCESS;
+ doc_id_t write_first_doc_id[8];
+ doc_id_t write_last_doc_id[8];
+ ib_uint32_t write_doc_count;
+
+ tuple = ins_ctx->tuple;
+
+ /* The first field is the tokenized word */
+ field = dtuple_get_nth_field(tuple, 0);
+ dfield_set_data(field, word->f_str, word->f_len);
+
+ /* The second field is first_doc_id */
+ field = dtuple_get_nth_field(tuple, 1);
+ fts_write_doc_id((byte*)&write_first_doc_id, node->first_doc_id);
+ dfield_set_data(field, &write_first_doc_id, sizeof(doc_id_t));
+
+	/* The third and fourth fields (TRX_ID, ROLL_PTR) are filled already. */
+ /* The fifth field is last_doc_id */
+ field = dtuple_get_nth_field(tuple, 4);
+ fts_write_doc_id((byte*)&write_last_doc_id, node->last_doc_id);
+ dfield_set_data(field, &write_last_doc_id, sizeof(doc_id_t));
+
+ /* The sixth field is doc_count */
+ field = dtuple_get_nth_field(tuple, 5);
+ mach_write_to_4((byte*)&write_doc_count, (ib_uint32_t)node->doc_count);
+ dfield_set_data(field, &write_doc_count, sizeof(ib_uint32_t));
+
+ /* The seventh field is ilist */
+ field = dtuple_get_nth_field(tuple, 6);
+ dfield_set_data(field, node->ilist, node->ilist_size);
+
+ ret = ins_ctx->btr_bulk->insert(tuple);
+
+ return(ret);
+}
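
For readability, the column order that row_merge_write_fts_node() fills in the
auxiliary-index tuple is summarized below (derived from the code above; the
enum and its names are hypothetical, for illustration only).

    enum example_fts_aux_field {
        EX_FTS_WORD = 0,      /* tokenized word, variable length */
        EX_FTS_FIRST_DOC_ID,  /* 8-byte Doc ID */
        EX_FTS_TRX_ID,        /* pre-filled by the caller */
        EX_FTS_ROLL_PTR,      /* pre-filled by the caller */
        EX_FTS_LAST_DOC_ID,   /* 8-byte Doc ID */
        EX_FTS_DOC_COUNT,     /* 4-byte count */
        EX_FTS_ILIST          /* position list blob */
    };
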
+
+/********************************************************************//**
+Insert processed FTS data into the auxiliary index tables.
+@return DB_SUCCESS if insertion runs fine */
+static MY_ATTRIBUTE((nonnull))
+dberr_t
+row_merge_write_fts_word(
+/*=====================*/
+ fts_psort_insert_t* ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word) /*!< in: sorted and tokenized
+ word */
+{
+ dberr_t ret = DB_SUCCESS;
+
+ ut_ad(ins_ctx->aux_index_id == fts_select_index(
+ ins_ctx->charset, word->text.f_str, word->text.f_len));
+
+	/* Pop out each fts_node in word->nodes and write it to the auxiliary table */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ dberr_t error;
+ fts_node_t* fts_node;
+
+ fts_node = static_cast<fts_node_t*>(ib_vector_get(word->nodes, i));
+
+ error = row_merge_write_fts_node(ins_ctx, &word->text, fts_node);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ib::error() << "Failed to write word to FTS auxiliary"
+ " index table "
+ << ins_ctx->btr_bulk->table_name()
+ << ", error " << error;
+ ret = error;
+ }
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ ib_vector_reset(word->nodes);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Read sorted FTS data files and insert the data tuples into the auxiliary tables.
+@return DB_SUCCESS or error number */
+static
+void
+row_fts_insert_tuple(
+/*=================*/
+ fts_psort_insert_t*
+ ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word, /*!< in: last processed
+ tokenized word */
+ ib_vector_t* positions, /*!< in: word position */
+ doc_id_t* in_doc_id, /*!< in: last item doc id */
+ dtuple_t* dtuple) /*!< in: entry to insert */
+{
+ fts_node_t* fts_node = NULL;
+ dfield_t* dfield;
+ doc_id_t doc_id;
+ ulint position;
+ fts_string_t token_word;
+ ulint i;
+
+	/* Get fts_node for the FTS auxiliary INDEX table */
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* If dtuple == NULL, this is the last word to be processed */
+ if (!dtuple) {
+ if (fts_node && ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id,
+ positions);
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx, word);
+ }
+
+ return;
+ }
+
+ /* Get the first field for the tokenized word */
+ dfield = dtuple_get_nth_field(dtuple, 0);
+
+ token_word.f_n_char = 0;
+ token_word.f_len = dfield->len;
+ token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!word->text.f_str) {
+ fts_string_dup(&word->text, &token_word, ins_ctx->heap);
+ }
+
+ /* compare to the last word, to see if they are the same
+ word */
+ if (innobase_fts_text_cmp(ins_ctx->charset,
+ &word->text, &token_word) != 0) {
+ ulint num_item;
+
+ /* Getting a new word, flush the last position info
+		for the current word in fts_node */
+ if (ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id, positions);
+ }
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx, word);
+
+ /* Copy the new word */
+ fts_string_dup(&word->text, &token_word, ins_ctx->heap);
+
+ num_item = ib_vector_size(positions);
+
+ /* Clean up position queue */
+ for (i = 0; i < num_item; i++) {
+ ib_vector_pop(positions);
+ }
+
+ /* Reset Doc ID */
+ *in_doc_id = 0;
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* Get the word's Doc ID */
+ dfield = dtuple_get_nth_field(dtuple, 1);
+
+ if (!ins_ctx->opt_doc_id_size) {
+ doc_id = fts_read_doc_id(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ } else {
+ doc_id = (doc_id_t) mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ }
+
+ /* Get the word's position info */
+ dfield = dtuple_get_nth_field(dtuple, 2);
+ position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)));
+
+ /* If this is the same word as the last word, and they
+ have the same Doc ID, we just need to add its position
+ info. Otherwise, we will flush position info to the
+ fts_node and initiate a new position vector */
+ if (!(*in_doc_id) || *in_doc_id == doc_id) {
+ ib_vector_push(positions, &position);
+ } else {
+ ulint num_pos = ib_vector_size(positions);
+
+ fts_cache_node_add_positions(NULL, fts_node,
+ *in_doc_id, positions);
+ for (i = 0; i < num_pos; i++) {
+ ib_vector_pop(positions);
+ }
+ ib_vector_push(positions, &position);
+ }
+
+ /* record the current Doc ID */
+ *in_doc_id = doc_id;
+}
+
+/*********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent where this value propagated to */
+static
+ulint
+row_fts_sel_tree_propagate(
+/*=======================*/
+ ulint propogated, /*<! in: tree node propagated */
+ int* sel_tree, /*<! in: selection tree */
+ const mrec_t** mrec, /*<! in: sort record */
+ rec_offs** offsets, /*<! in: record offsets */
+ dict_index_t* index) /*<! in/out: FTS index */
+{
+ ulint parent;
+ int child_left;
+ int child_right;
+ int selected;
+
+ /* Find which parent this value will be propagated to */
+ parent = (propogated - 1) / 2;
+
+ /* Find out which value is smaller, and to propagate */
+ child_left = sel_tree[parent * 2 + 1];
+ child_right = sel_tree[parent * 2 + 2];
+
+ if (child_left == -1 || mrec[child_left] == NULL) {
+ if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = -1;
+ } else {
+ selected = child_right ;
+ }
+ } else if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = child_left;
+ } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right],
+ offsets[child_left],
+ offsets[child_right],
+ index, NULL) < 0) {
+ selected = child_left;
+ } else {
+ selected = child_right;
+ }
+
+ sel_tree[parent] = selected;
+
+ return parent;
+}
+
+/*********************************************************************//**
+Readjust selection tree after popping the root and read a new value
+@return the new root */
+static
+int
+row_fts_sel_tree_update(
+/*====================*/
+ int* sel_tree, /*<! in/out: selection tree */
+ ulint propagated, /*<! in: node to propagate up */
+ ulint height, /*<! in: tree height */
+ const mrec_t** mrec, /*<! in: sort record */
+ rec_offs** offsets, /*<! in: record offsets */
+ dict_index_t* index) /*<! in: index dictionary */
+{
+ ulint i;
+
+ for (i = 1; i <= height; i++) {
+ propagated = row_fts_sel_tree_propagate(
+ propagated, sel_tree, mrec, offsets, index);
+ }
+
+ return(sel_tree[0]);
+}
+
+/*********************************************************************//**
+Build selection tree at a specified level */
+static
+void
+row_fts_build_sel_tree_level(
+/*=========================*/
+ int* sel_tree, /*<! in/out: selection tree */
+ ulint level, /*<! in: selection tree level */
+ const mrec_t** mrec, /*<! in: sort record */
+ rec_offs** offsets, /*<! in: record offsets */
+ dict_index_t* index) /*<! in: index dictionary */
+{
+ ulint start;
+ int child_left;
+ int child_right;
+ ulint i;
+ ulint num_item = ulint(1) << level;
+
+ start = num_item - 1;
+
+ for (i = 0; i < num_item; i++) {
+ child_left = sel_tree[(start + i) * 2 + 1];
+ child_right = sel_tree[(start + i) * 2 + 2];
+
+ if (child_left == -1) {
+ if (child_right == -1) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (child_right == -1) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Deal with NULL child conditions */
+ if (!mrec[child_left]) {
+ if (!mrec[child_right]) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (!mrec[child_right]) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Select the smaller one to set parent pointer */
+ int cmp = cmp_rec_rec_simple(
+ mrec[child_left], mrec[child_right],
+ offsets[child_left], offsets[child_right],
+ index, NULL);
+
+ sel_tree[start + i] = cmp < 0 ? child_left : child_right;
+ }
+}
+
+/*********************************************************************//**
+Build a selection tree for the merge. The selection tree is a binary tree
+and should have fts_sort_pll_degree / 2 levels, with the root at level 0.
+@return number of tree levels */
+static
+ulint
+row_fts_build_sel_tree(
+/*===================*/
+ int* sel_tree, /*<! in/out: selection tree */
+ const mrec_t** mrec, /*<! in: sort record */
+ rec_offs** offsets, /*<! in: record offsets */
+ dict_index_t* index) /*<! in: index dictionary */
+{
+ ulint treelevel = 1;
+ ulint num = 2;
+ ulint i = 0;
+ ulint start;
+
+ /* No need to build selection tree if we only have two merge threads */
+ if (fts_sort_pll_degree <= 2) {
+ return(0);
+ }
+
+ while (num < fts_sort_pll_degree) {
+ num = num << 1;
+ treelevel++;
+ }
+
+ start = (ulint(1) << treelevel) - 1;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ sel_tree[i + start] = int(i);
+ }
+
+ i = treelevel;
+ do {
+ row_fts_build_sel_tree_level(
+ sel_tree, --i, mrec, offsets, index);
+ } while (i > 0);
+
+ return(treelevel);
+}
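
The selection tree above is stored as an implicit binary heap inside the
sel_tree array; the index arithmetic it relies on is sketched below (helper
names hypothetical): node 0 is the root, the leaves start at
(1 << treelevel) - 1, and a stored value of -1 means "no record available".

    static inline ulint example_sel_parent(ulint node)  { return (node - 1) / 2; }
    static inline ulint example_sel_left(ulint parent)  { return parent * 2 + 1; }
    static inline ulint example_sel_right(ulint parent) { return parent * 2 + 2; }
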
+
+/*********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ ulint id) /* !< in: which auxiliary table's data
+ to insert to */
+{
+ const byte** b;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* heap;
+ dberr_t error = DB_SUCCESS;
+ ulint* foffs;
+ rec_offs** offsets;
+ fts_tokenizer_word_t new_word;
+ ib_vector_t* positions;
+ doc_id_t last_doc_id;
+ ib_alloc_t* heap_alloc;
+ ulint i;
+ mrec_buf_t** buf;
+ pfs_os_file_t* fd;
+ byte** block;
+ byte** crypt_block;
+ const mrec_t** mrec;
+ ulint count = 0;
+ int* sel_tree;
+ ulint height;
+ ulint start;
+ fts_psort_insert_t ins_ctx;
+ uint64_t count_diag = 0;
+ fts_table_t fts_table;
+ char aux_table_name[MAX_FULL_NAME_LEN];
+ dict_table_t* aux_table;
+ dict_index_t* aux_index;
+ trx_t* trx;
+
+ /* We use the insert query graph as the dummy graph
+ needed in the row module call */
+
+ trx = trx_create();
+ trx_start_if_not_started(trx, true);
+
+ trx->op_info = "inserting index entries";
+
+ ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size;
+
+ heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+
+ b = (const byte**) mem_heap_alloc(
+ heap, sizeof (*b) * fts_sort_pll_degree);
+ foffs = (ulint*) mem_heap_alloc(
+ heap, sizeof(*foffs) * fts_sort_pll_degree);
+ offsets = (rec_offs**) mem_heap_alloc(
+ heap, sizeof(*offsets) * fts_sort_pll_degree);
+ buf = (mrec_buf_t**) mem_heap_alloc(
+ heap, sizeof(*buf) * fts_sort_pll_degree);
+ fd = (pfs_os_file_t*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree);
+ block = (byte**) mem_heap_alloc(
+ heap, sizeof(*block) * fts_sort_pll_degree);
+ crypt_block = (byte**) mem_heap_alloc(
+ heap, sizeof(*block) * fts_sort_pll_degree);
+ mrec = (const mrec_t**) mem_heap_alloc(
+ heap, sizeof(*mrec) * fts_sort_pll_degree);
+ sel_tree = (int*) mem_heap_alloc(
+ heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2));
+
+ tuple_heap = mem_heap_create(1000);
+
+ ins_ctx.charset = fts_index_get_charset(index);
+ ins_ctx.heap = heap;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ ulint num;
+
+ num = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets[i] = static_cast<rec_offs*>(mem_heap_zalloc(
+ heap, num * sizeof *offsets[i]));
+ rec_offs_set_n_alloc(offsets[i], num);
+ rec_offs_set_n_fields(offsets[i], dict_index_get_n_fields(index));
+ block[i] = psort_info[i].merge_block[id];
+ crypt_block[i] = psort_info[i].crypt_block[id];
+ b[i] = psort_info[i].merge_block[id];
+ fd[i] = psort_info[i].merge_file[id]->fd;
+ foffs[i] = 0;
+
+ buf[i] = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, sizeof *buf[i]));
+
+ count_diag += psort_info[i].merge_file[id]->n_rec;
+ }
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "InnoDB_FTS: to insert " << count_diag
+ << " records";
+ }
+
+ /* Initialize related variables if creating FTS indexes */
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ memset(&new_word, 0, sizeof(new_word));
+
+ new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4);
+ positions = ib_vector_create(heap_alloc, sizeof(ulint), 32);
+ last_doc_id = 0;
+
+ /* We should set the flags2 with aux_table_name here,
+ in order to get the correct aux table names. */
+ index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME
+ & ((1U << DICT_TF2_BITS) - 1););
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = table->id;
+ fts_table.table = index->table;
+ fts_table.suffix = fts_get_suffix(id);
+
+ /* Get aux index */
+ fts_get_table_name(&fts_table, aux_table_name);
+ aux_table = dict_table_open_on_name(aux_table_name, false,
+ DICT_ERR_IGNORE_NONE);
+ ut_ad(aux_table != NULL);
+ aux_index = dict_table_get_first_index(aux_table);
+
+ ut_ad(!aux_index->is_instant());
+ /* row_merge_write_fts_node() depends on the correct value */
+ ut_ad(aux_index->n_core_null_bytes
+ == UT_BITS_IN_BYTES(aux_index->n_nullable));
+
+ /* Create bulk load instance */
+ ins_ctx.btr_bulk = UT_NEW_NOKEY(BtrBulk(aux_index, trx));
+
+ /* Create tuple for insert */
+ ins_ctx.tuple = dtuple_create(heap, dict_index_get_n_fields(aux_index));
+ dict_index_copy_types(ins_ctx.tuple, aux_index,
+ dict_index_get_n_fields(aux_index));
+
+ /* Set TRX_ID and ROLL_PTR */
+ dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 2),
+ &reset_trx_id, DATA_TRX_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 3),
+ &reset_trx_id[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN);
+
+ ut_d(ins_ctx.aux_index_id = id);
+
+ const ulint space = table->space_id;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ if (psort_info[i].merge_file[id]->n_rec == 0) {
+ /* No Rows to read */
+ mrec[i] = b[i] = NULL;
+ } else {
+ /* Read from temp file only if it has been
+ written to. Otherwise, block memory holds
+ all the sorted records */
+ if (psort_info[i].merge_file[id]->offset > 0
+ && (!row_merge_read(
+ fd[i], foffs[i],
+ (row_merge_block_t*) block[i],
+ (row_merge_block_t*) crypt_block[i],
+ space))) {
+ error = DB_CORRUPTION;
+ goto exit;
+ }
+
+ ROW_MERGE_READ_GET_NEXT(i);
+ }
+ }
+
+ height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec,
+ offsets, index);
+
+ start = (1U << height) - 1;
+
+ /* Fetch sorted records from sort buffer and insert them into
+ corresponding FTS index auxiliary tables */
+ for (;;) {
+ dtuple_t* dtuple;
+ int min_rec = 0;
+
+ if (fts_sort_pll_degree <= 2) {
+ while (!mrec[min_rec]) {
+ min_rec++;
+
+ if (min_rec >= (int) fts_sort_pll_degree) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ for (i = min_rec + 1; i < fts_sort_pll_degree; i++) {
+ if (!mrec[i]) {
+ continue;
+ }
+
+ if (cmp_rec_rec_simple(
+ mrec[i], mrec[min_rec],
+ offsets[i], offsets[min_rec],
+ index, NULL) < 0) {
+ min_rec = static_cast<int>(i);
+ }
+ }
+ } else {
+ min_rec = sel_tree[0];
+
+ if (min_rec == -1) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec[min_rec], index, offsets[min_rec],
+ tuple_heap);
+
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word, positions,
+ &last_doc_id, dtuple);
+
+
+ ROW_MERGE_READ_GET_NEXT(min_rec);
+
+ if (fts_sort_pll_degree > 2) {
+ if (!mrec[min_rec]) {
+ sel_tree[start + min_rec] = -1;
+ }
+
+ row_fts_sel_tree_update(sel_tree, start + min_rec,
+ height, mrec,
+ offsets, index);
+ }
+
+ count++;
+
+ mem_heap_empty(tuple_heap);
+ }
+
+exit:
+ fts_sql_commit(trx);
+
+ trx->op_info = "";
+
+ mem_heap_free(tuple_heap);
+
+ error = ins_ctx.btr_bulk->finish(error);
+ UT_DELETE(ins_ctx.btr_bulk);
+
+ aux_table->release();
+
+ trx->free();
+
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "InnoDB_FTS: inserted " << count << " records";
+ }
+
+ return(error);
+}
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
new file mode 100644
index 00000000..d2609fdb
--- /dev/null
+++ b/storage/innobase/row/row0import.cc
@@ -0,0 +1,4585 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0import.cc
+Import a tablespace to a running instance.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0import.h"
+#include "btr0pcur.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif
+#include "buf0flu.h"
+#include "que0que.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "pars0pars.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "row0quiesce.h"
+#include "fil0pagecompress.h"
+#include "trx0undo.h"
+#include "lock0lock.h"
+#include "lzo/lzo1x.h"
+#include "snappy-c.h"
+#include "log.h"
+
+#include "scope.h"
+
+#include <vector>
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+using st_::span;
+
+/** The size of the buffer to use for IO.
+@param n physical page size
+@return number of pages */
+#define IO_BUFFER_SIZE(n) ((1024 * 1024) / (n))
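+
+/* For example, with the default innodb_page_size of 16KiB this evaluates
+to IO_BUFFER_SIZE(16384) == 64, i.e. the tablespace file is read and
+written in batches of 64 pages (1MiB) at a time. */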
+
+/** For gathering stats on records during phase I */
+struct row_stats_t {
+ ulint m_n_deleted; /*!< Number of deleted records
+ found in the index */
+
+ ulint m_n_purged; /*!< Number of records purged
+						optimistically */
+
+ ulint m_n_rows; /*!< Number of rows */
+
+ ulint m_n_purge_failed; /*!< Number of deleted rows
+ that could not be purged */
+};
+
+/** Index information required by IMPORT. */
+struct row_index_t {
+ index_id_t m_id; /*!< Index id of the table
+ in the exporting server */
+ byte* m_name; /*!< Index name */
+
+ uint32_t m_space; /*!< Space where it is placed */
+
+ uint32_t m_page_no; /*!< Root page number */
+
+ ulint m_type; /*!< Index type */
+
+ ulint m_trx_id_offset; /*!< Relevant only for clustered
+ indexes, offset of transaction
+ id system column */
+
+ ulint m_n_user_defined_cols; /*!< User defined columns */
+
+ ulint m_n_uniq; /*!< Number of columns that can
+ uniquely identify the row */
+
+ ulint m_n_nullable; /*!< Number of nullable
+ columns */
+
+ ulint m_n_fields; /*!< Total number of fields */
+
+ dict_field_t* m_fields; /*!< Index fields */
+
+ const dict_index_t*
+ m_srv_index; /*!< Index instance in the
+ importing server */
+
+ row_stats_t m_stats; /*!< Statistics gathered during
+ the import phase */
+
+};
+
+/** Meta data required by IMPORT. */
+struct row_import {
+ row_import() UNIV_NOTHROW
+ :
+ m_table(NULL),
+ m_version(0),
+ m_hostname(NULL),
+ m_table_name(NULL),
+ m_autoinc(0),
+ m_zip_size(0),
+ m_flags(0),
+ m_n_cols(0),
+ m_cols(NULL),
+ m_col_names(NULL),
+ m_n_indexes(0),
+ m_indexes(NULL),
+ m_missing(true) { }
+
+ ~row_import() UNIV_NOTHROW;
+
+	/** Find the index entry in the indexes array.
+ @param name index name
+ @return instance if found else 0. */
+ row_index_t* get_index(const char* name) const UNIV_NOTHROW;
+
+ /** Get the number of rows in the index.
+ @param name index name
+ @return number of rows (doesn't include delete marked rows). */
+ ulint get_n_rows(const char* name) const UNIV_NOTHROW;
+
+ /** Find the ordinal value of the column name in the cfg table columns.
+ @param name of column to look for.
+ @return ULINT_UNDEFINED if not found. */
+ ulint find_col(const char* name) const UNIV_NOTHROW;
+
+ /** Get the number of rows for which purge failed during the
+ convert phase.
+ @param name index name
+ @return number of rows for which purge failed. */
+ ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW;
+
+	/** Check whether the index still contains delete-marked records
+	that could not be purged.
+	@param name index name
+	@return true if the index needs to be purged. */
+ bool requires_purge(const char* name) const UNIV_NOTHROW
+ {
+ return(get_n_purge_failed(name) > 0);
+ }
+
+ /** Set the index root <space, pageno> using the index name */
+ void set_root_by_name() UNIV_NOTHROW;
+
+ /** Set the index root <space, pageno> using a heuristic
+ @return DB_SUCCESS or error code */
+ dberr_t set_root_by_heuristic() UNIV_NOTHROW;
+
+ /** Check if the index schema that was read from the .cfg file
+ matches the in memory index definition.
+	Note: It will update row_index_t::m_srv_index to map the meta-data
+ read from the .cfg file to the server index instance.
+ @return DB_SUCCESS or error code. */
+ dberr_t match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW;
+
+ /** Check if the table schema that was read from the .cfg file
+ matches the in memory table definition.
+ @param thd MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_table_columns(
+ THD* thd) UNIV_NOTHROW;
+
+ /** Check if the table (and index) schema that was read from the
+ .cfg file matches the in memory table definition.
+ @param thd MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_schema(
+ THD* thd) UNIV_NOTHROW;
+
+	dberr_t match_flags(THD *thd) const;
+
+
+ dict_table_t* m_table; /*!< Table instance */
+
+ ulint m_version; /*!< Version of config file */
+
+ byte* m_hostname; /*!< Hostname where the
+ tablespace was exported */
+ byte* m_table_name; /*!< Exporting instance table
+ name */
+
+ ib_uint64_t m_autoinc; /*!< Next autoinc value */
+
+ ulint m_zip_size; /*!< ROW_FORMAT=COMPRESSED
+ page size, or 0 */
+
+ ulint m_flags; /*!< Table flags */
+
+ ulint m_n_cols; /*!< Number of columns in the
+ meta-data file */
+
+ dict_col_t* m_cols; /*!< Column data */
+
+	byte**		m_col_names;		/*!< Column names; we store the
+						column names separately because
+						there is no field to store the
+						value in dict_col_t */
+
+ ulint m_n_indexes; /*!< Number of indexes,
+ including clustered index */
+
+ row_index_t* m_indexes; /*!< Index meta data */
+
+	bool		m_missing;		/*!< true if the .cfg file was
+						missing or was not readable */
+};
+
+struct fil_iterator_t {
+ pfs_os_file_t file; /*!< File handle */
+ const char* filepath; /*!< File path name */
+ os_offset_t start; /*!< From where to start */
+ os_offset_t end; /*!< Where to stop */
+ os_offset_t file_size; /*!< File size in bytes */
+ ulint n_io_buffers; /*!< Number of pages to use
+ for IO */
+ byte* io_buffer; /*!< Buffer to use for IO */
+ fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */
+ byte* crypt_io_buffer; /*!< IO buffer when encrypted */
+};
+
+/** Use the page cursor to iterate over records in a block. */
+class RecIterator {
+public:
+ /** Default constructor */
+ RecIterator() UNIV_NOTHROW
+ {
+ memset(&m_cur, 0x0, sizeof(m_cur));
+ /* Make page_cur_delete_rec() happy. */
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ /** Position the cursor on the first user record. */
+ rec_t* open(buf_block_t* block, const dict_index_t* index) noexcept
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ m_cur.index = const_cast<dict_index_t*>(index);
+ page_cur_set_before_first(block, &m_cur);
+ return next();
+ }
+
+ /** Move to the next record. */
+ rec_t* next() noexcept MY_ATTRIBUTE((warn_unused_result))
+ {
+ return page_cur_move_to_next(&m_cur);
+ }
+
+ /**
+ @return the current record */
+ rec_t* current() UNIV_NOTHROW
+ {
+ ut_ad(!end());
+ return(page_cur_get_rec(&m_cur));
+ }
+
+ buf_block_t* current_block() const { return m_cur.block; }
+
+ /**
+ @return true if cursor is at the end */
+ bool end() UNIV_NOTHROW
+ {
+ return(page_cur_is_after_last(&m_cur) == TRUE);
+ }
+
+ /** Remove the current record
+ @return true on success */
+ bool remove(rec_offs* offsets) UNIV_NOTHROW
+ {
+ const dict_index_t* const index = m_cur.index;
+ ut_ad(page_is_leaf(m_cur.block->page.frame));
+ /* We can't end up with an empty page unless it is root. */
+ if (page_get_n_recs(m_cur.block->page.frame) <= 1) {
+ return(false);
+ }
+
+ if (!rec_offs_any_extern(offsets)
+ && m_cur.block->page.id().page_no() != index->page
+ && ((page_get_data_size(m_cur.block->page.frame)
+ - rec_offs_size(offsets)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(index))
+ || !page_has_siblings(m_cur.block->page.frame)
+ || (page_get_n_recs(m_cur.block->page.frame) < 2))) {
+ return false;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block);
+ ut_a(!page_zip || page_zip_validate(
+ page_zip, m_cur.block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_cur_delete_rec(&m_cur, offsets, &m_mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(
+ page_zip, m_cur.block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ return true;
+ }
+
+private:
+ page_cur_t m_cur;
+public:
+ mtr_t m_mtr;
+};
+
+/** Class that purges delete marked records from indexes, both secondary
+and clustered. It does a pessimistic delete. This should only be done if we
+couldn't purge the delete marked records during Phase I. */
+class IndexPurge {
+public:
+	/** Constructor
+	@param trx the user transaction covering the import tablespace
+	@param index index to be imported */
+ IndexPurge(
+ trx_t* trx,
+ dict_index_t* index) UNIV_NOTHROW
+ :
+ m_trx(trx),
+ m_index(index),
+ m_n_rows(0)
+ {
+ ib::info() << "Phase II - Purge records from index "
+ << index->name;
+ }
+
+ /** Destructor */
+ ~IndexPurge() UNIV_NOTHROW = default;
+
+ /** Purge delete marked records.
+ @return DB_SUCCESS or error code. */
+ dberr_t garbage_collect() UNIV_NOTHROW;
+
+ /** The number of records that are not delete marked.
+ @return total records in the index after purge */
+ ulint get_n_rows() const UNIV_NOTHROW
+ {
+ return(m_n_rows);
+ }
+
+private:
+ /** Begin import, position the cursor on the first record. */
+ inline bool open() noexcept;
+
+ /** Close the persistent cursor and commit the mini-transaction. */
+ void close() noexcept { m_mtr.commit(); btr_pcur_close(&m_pcur); }
+
+ /** Position the cursor on the next record.
+ @return DB_SUCCESS or error code */
+ dberr_t next() noexcept;
+
+ /** Store the persistent cursor position and reopen the
+ B-tree cursor in BTR_MODIFY_TREE mode, because the
+ tree structure may be changed during a pessimistic delete. */
+ inline dberr_t purge_pessimistic_delete() noexcept;
+
+ /** Purge a delete-marked record. */
+ dberr_t purge() noexcept;
+
+protected:
+ // Disable copying
+ IndexPurge();
+ IndexPurge(const IndexPurge&);
+ IndexPurge &operator=(const IndexPurge&);
+
+private:
+ trx_t* m_trx; /*!< User transaction */
+ mtr_t m_mtr; /*!< Mini-transaction */
+ btr_pcur_t m_pcur; /*!< Persistent cursor */
+ dict_index_t* m_index; /*!< Index to be processed */
+ ulint m_n_rows; /*!< Records in index */
+};
+
+/** Functor that is called for each physical page that is read from the
+tablespace file. */
+class AbstractCallback
+{
+public:
+ /** Constructor
+ @param trx covering transaction */
+ AbstractCallback(trx_t* trx, uint32_t space_id)
+ :
+ m_zip_size(0),
+ m_trx(trx),
+ m_space(space_id),
+ m_xdes(),
+ m_xdes_page_no(UINT32_MAX),
+ m_space_flags(UINT32_MAX) UNIV_NOTHROW { }
+
+ /** Free any extent descriptor instance */
+ virtual ~AbstractCallback()
+ {
+ UT_DELETE_ARRAY(m_xdes);
+ }
+
+ /** Determine the page size to use for traversing the tablespace
+ @param file_size size of the tablespace file in bytes
+ @param block contents of the first page in the tablespace file.
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW;
+
+ /** @return true if compressed table. */
+ bool is_compressed_table() const UNIV_NOTHROW
+ {
+ return get_zip_size();
+ }
+
+ /** @return the tablespace flags */
+ uint32_t get_space_flags() const { return m_space_flags; }
+
+ /**
+ Set the name of the physical file and the file handle that is used
+ to open it for the file that is being iterated over.
+ @param filename the physical name of the tablespace file
+ @param file OS file handle */
+ void set_file(const char* filename, pfs_os_file_t file) UNIV_NOTHROW
+ {
+ m_file = file;
+ m_filepath = filename;
+ }
+
+ ulint get_zip_size() const { return m_zip_size; }
+ ulint physical_size() const
+ {
+ return m_zip_size ? m_zip_size : srv_page_size;
+ }
+
+ const char* filename() const { return m_filepath; }
+
+ /**
+ Called for every page in the tablespace. If the page was not
+ updated then its state must be set to BUF_PAGE_NOT_USED. For
+ compressed tables the page descriptor memory will be at offset:
+ block->page.frame + srv_page_size;
+ @param block block read from file, note it is not from the buffer pool
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0;
+
+ /** @return the tablespace identifier */
+ uint32_t get_space_id() const { return m_space; }
+
+ bool is_interrupted() const { return trx_is_interrupted(m_trx); }
+
+ /**
+ Get the data page depending on the table type, compressed or not.
+ @param block - block read from disk
+ @retval the buffer frame */
+ static byte* get_frame(const buf_block_t* block)
+ {
+ return block->page.zip.data
+ ? block->page.zip.data : block->page.frame;
+ }
+
+ /** Invoke the functionality for the callback */
+ virtual dberr_t run(const fil_iterator_t& iter,
+ buf_block_t* block) UNIV_NOTHROW = 0;
+
+protected:
+ /** Get the physical offset of the extent descriptor within the page.
+ @param page_no page number of the extent descriptor
+ @param page contents of the page containing the extent descriptor.
+	@return pointer to the extent descriptor entry within the page */
+ const xdes_t* xdes(
+ ulint page_no,
+ const page_t* page) const UNIV_NOTHROW
+ {
+ ulint offset;
+
+ offset = xdes_calc_descriptor_index(get_zip_size(), page_no);
+
+ return(page + XDES_ARR_OFFSET + XDES_SIZE * offset);
+ }
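+
+	/* Illustration: with the default 16KiB page size an extent holds
+	FSP_EXTENT_SIZE = 64 pages and one descriptor page covers 16384
+	pages (256 extent descriptors), so page 130 maps to descriptor
+	index 130 / 64 = 2 within the first descriptor page. */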
+
+	/** Set the current extent descriptor page (xdes). If the extent
+	descriptor is marked as free then free the current extent descriptor
+	and set it to 0. This implies that all pages that are covered by this
+	extent descriptor are also freed.
+
+ @param page_no offset of page within the file
+ @param page page contents
+ @return DB_SUCCESS or error code. */
+ dberr_t set_current_xdes(
+ uint32_t page_no,
+ const page_t* page) UNIV_NOTHROW
+ {
+ m_xdes_page_no = page_no;
+
+ UT_DELETE_ARRAY(m_xdes);
+ m_xdes = NULL;
+
+ if (mach_read_from_4(XDES_ARR_OFFSET + XDES_STATE + page)
+ != XDES_FREE) {
+ const ulint physical_size = m_zip_size
+ ? m_zip_size : srv_page_size;
+
+ m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, physical_size);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_13",
+ UT_DELETE_ARRAY(m_xdes);
+ m_xdes = NULL;
+ );
+
+ if (m_xdes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(m_xdes, page, physical_size);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ /** Check if the page is marked as free in the extent descriptor.
+ @param page_no page number to check in the extent descriptor.
+ @return true if the page is marked as free */
+ bool is_free(uint32_t page_no) const UNIV_NOTHROW
+ {
+ ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no)
+ == m_xdes_page_no);
+
+ if (m_xdes != 0) {
+ const xdes_t* xdesc = xdes(page_no, m_xdes);
+ ulint pos = page_no % FSP_EXTENT_SIZE;
+
+ return xdes_is_free(xdesc, pos);
+ }
+
+ /* If the current xdes was free, the page must be free. */
+ return(true);
+ }
+
+protected:
+ /** The ROW_FORMAT=COMPRESSED page size, or 0. */
+ ulint m_zip_size;
+
+ /** File handle to the tablespace */
+ pfs_os_file_t m_file;
+
+ /** Physical file path. */
+ const char* m_filepath;
+
+ /** Covering transaction. */
+ trx_t* m_trx;
+
+ /** Space id of the file being iterated over. */
+ uint32_t m_space;
+
+ /** Current extent descriptor page */
+ xdes_t* m_xdes;
+
+ /** Physical page offset in the file of the extent descriptor */
+ uint32_t m_xdes_page_no;
+
+ /** Flags value read from the header page */
+ uint32_t m_space_flags;
+};
+
+ATTRIBUTE_COLD static dberr_t invalid_space_flags(uint32_t flags)
+{
+ if (fsp_flags_is_incompatible_mysql(flags))
+ {
+ sql_print_error("InnoDB: unsupported MySQL tablespace");
+ return DB_UNSUPPORTED;
+ }
+
+ sql_print_error("InnoDB: Invalid FSP_SPACE_FLAGS=0x%" PRIx32, flags);
+ return DB_CORRUPTION;
+}
+
+/** Determine the page size to use for traversing the tablespace
+@param file_size size of the tablespace file in bytes
+@param block contents of the first page in the tablespace file.
+@retval DB_SUCCESS or error code. */
+dberr_t
+AbstractCallback::init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW
+{
+ const page_t* page = block->page.frame;
+
+ m_space_flags = fsp_header_get_flags(page);
+ if (!fil_space_t::is_valid_flags(m_space_flags, true)) {
+ uint32_t cflags = fsp_flags_convert_from_101(m_space_flags);
+ if (cflags == UINT32_MAX) {
+ return DB_CORRUPTION;
+ }
+ m_space_flags = cflags;
+ }
+
+ /* Clear the DATA_DIR flag, which is basically garbage. */
+ m_space_flags &= ~(1U << FSP_FLAGS_POS_RESERVED);
+ m_zip_size = fil_space_t::zip_size(m_space_flags);
+ const ulint logical_size = fil_space_t::logical_size(m_space_flags);
+ const ulint physical_size = fil_space_t::physical_size(m_space_flags);
+
+ if (logical_size != srv_page_size) {
+
+ ib::error() << "Page size " << logical_size
+ << " of ibd file is not the same as the server page"
+ " size " << srv_page_size;
+
+ return(DB_CORRUPTION);
+
+ } else if (file_size & (physical_size - 1)) {
+
+ ib::error() << "File size " << file_size << " is not a"
+ " multiple of the page size "
+ << physical_size;
+
+ return(DB_CORRUPTION);
+ }
+
+ if (m_space == UINT32_MAX) {
+ m_space = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + page);
+ }
+
+ return set_current_xdes(0, page);
+}
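+
+/* For instance, a table created with ROW_FORMAT=COMPRESSED
+KEY_BLOCK_SIZE=8 on a server using 16KiB pages has a logical page size of
+16384 bytes and a physical page size of 8192 bytes; the .ibd file size
+must then be a multiple of 8192 bytes or the import is rejected. */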
+
+/**
+TODO: This could trivially be made parallel by chunking up the file
+and creating a callback per thread. The main benefit would be to use
+multiple CPUs for checksums and compressed tables. We have to process
+compressed tables block by block right now. Secondly, we need to
+decompress/compress and copy too much data. These operations are
+CPU intensive.
+
+Iterate over all the pages in the tablespace.
+@param iter - Tablespace iterator
+@param block - block to use for IO
+@param callback - Callback to inspect and update page contents
+@retval DB_SUCCESS or error code */
+static dberr_t fil_iterate(
+ const fil_iterator_t& iter,
+ buf_block_t* block,
+ AbstractCallback& callback);
+
+/**
+Try to determine the index root pages by checking if the next/prev
+pointers are both FIL_NULL. We need to ensure that we skip deleted pages. */
+struct FetchIndexRootPages : public AbstractCallback {
+
+ /** Index information gathered from the .ibd file. */
+ struct Index {
+
+ Index(index_id_t id, uint32_t page_no)
+ :
+ m_id(id),
+ m_page_no(page_no) { }
+
+ index_id_t m_id; /*!< Index id */
+ uint32_t m_page_no; /*!< Root page number */
+ };
+
+ /** Constructor
+ @param trx covering (user) transaction
+	@param table table definition in the server. */
+ FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+ :
+ AbstractCallback(trx, UINT32_MAX),
+ m_table(table), m_index(0, 0) UNIV_NOTHROW { }
+
+ /** Destructor */
+ ~FetchIndexRootPages() UNIV_NOTHROW override = default;
+
+ /** Fetch the clustered index root page in the tablespace
+ @param iter Tablespace iterator
+ @param block Block to use for IO
+ @retval DB_SUCCESS or error code */
+ dberr_t run(const fil_iterator_t& iter,
+ buf_block_t* block) UNIV_NOTHROW override;
+
+ /** Called for each block as it is read from the file.
+ @param block block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override;
+
+ /** Update the import configuration that will be used to import
+ the tablespace. */
+ dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
+
+ /** Table definition in server. */
+ const dict_table_t* m_table;
+
+ /** Index information */
+ Index m_index;
+};
+
+/** Called for each block as it is read from the file. Check index pages to
+determine the exact row format. We can't get that from the tablespace
+header flags alone.
+
+@param block block to convert, it is not from the buffer pool.
+@retval DB_SUCCESS or error code. */
+dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW
+{
+ if (is_interrupted()) return DB_INTERRUPTED;
+
+ const page_t* page = get_frame(block);
+
+ m_index.m_id = btr_page_get_index_id(page);
+ m_index.m_page_no = block->page.id().page_no();
+
+ /* Check that the tablespace flags match the table flags. */
+ const uint32_t expected = dict_tf_to_fsp_flags(m_table->flags);
+ if (!fsp_flags_match(expected, m_space_flags)) {
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Expected FSP_SPACE_FLAGS=0x%x, .ibd "
+ "file contains 0x%x.",
+ unsigned(expected),
+ unsigned(m_space_flags));
+ return(DB_CORRUPTION);
+ }
+
+ if (!page_is_comp(block->page.frame) !=
+ !dict_table_is_comp(m_table)) {
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "ROW_FORMAT mismatch");
+ return DB_CORRUPTION;
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Update the import configuration that will be used to import the tablespace.
+@return error code or DB_SUCCESS */
+dberr_t
+FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW
+{
+ ut_a(cfg->m_table == m_table);
+ cfg->m_zip_size = m_zip_size;
+ cfg->m_n_indexes = 1;
+
+ if (cfg->m_n_indexes == 0) {
+
+ ib::error() << "No B+Tree found in tablespace";
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_11",
+ UT_DELETE_ARRAY(cfg->m_indexes);
+ cfg->m_indexes = NULL;
+ );
+
+ if (cfg->m_indexes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ row_index_t* cfg_index = cfg->m_indexes;
+
+ char name[BUFSIZ];
+
+ snprintf(name, sizeof(name), "index" IB_ID_FMT, m_index.m_id);
+
+ ulint len = strlen(name) + 1;
+
+ cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_12",
+ UT_DELETE_ARRAY(cfg_index->m_name);
+ cfg_index->m_name = NULL;
+ );
+
+ if (cfg_index->m_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(cfg_index->m_name, name, len);
+
+ cfg_index->m_id = m_index.m_id;
+
+ cfg_index->m_space = m_space;
+
+ cfg_index->m_page_no = m_index.m_page_no;
+
+ return(DB_SUCCESS);
+}
+
+/* Functor that is called for each physical page that is read from the
+tablespace file.
+
+ 1. Check each page for corruption.
+
+ 2. Update the space id and LSN on every page
+ * For the header page
+ - Validate the flags
+ - Update the LSN
+
+ 3. On Btree pages
+ * Set the index id
+ * Update the max trx id
+ * In a clustered index, update the system columns
+ * In a clustered index, update the BLOB ptr, set the space id
+ * Purge delete marked records, but only if they can be easily
+ removed from the page
+ * Keep a counter of number of rows, i.e. non-delete-marked rows
+ * Keep a counter of number of delete marked rows
+ * Keep a counter of number of purge failures
+ * If a page is stamped with an index id that isn't in the .cfg file
+ we assume it is deleted and the page can be ignored.
+
+ 4. Set the page state to dirty so that it will be written to disk.
+*/
+class PageConverter : public AbstractCallback {
+public:
+ /** Constructor
+ @param cfg config of table being imported.
+ @param space_id tablespace identifier
+ @param trx transaction covering the import */
+ PageConverter(row_import* cfg, uint32_t space_id, trx_t* trx)
+ :
+ AbstractCallback(trx, space_id),
+ m_cfg(cfg),
+ m_index(cfg->m_indexes),
+ m_rec_iter(),
+ m_offsets_(), m_offsets(m_offsets_),
+ m_heap(0),
+ m_cluster_index(dict_table_get_first_index(cfg->m_table))
+ {
+ rec_offs_init(m_offsets_);
+ }
+
+ ~PageConverter() UNIV_NOTHROW override
+ {
+ if (m_heap != 0) {
+ mem_heap_free(m_heap);
+ }
+ }
+
+ dberr_t run(const fil_iterator_t& iter,
+ buf_block_t* block) UNIV_NOTHROW override
+ {
+ return fil_iterate(iter, block, *this);
+ }
+
+ /** Called for each block as it is read from the file.
+ @param block block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override;
+
+private:
+ /** Update the page, set the space id, max trx id and index id.
+ @param block block read from file
+ @param page_type type of the page
+ @retval DB_SUCCESS or error code */
+ dberr_t update_page(buf_block_t* block, uint16_t& page_type)
+ UNIV_NOTHROW;
+
+ /** Update the space, index id, trx id.
+ @param block block to convert
+ @return DB_SUCCESS or error code */
+ dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW;
+
+	/** Update the BLOB references and write UNDO log entries for
+ rows that can't be purged optimistically.
+ @param block block to update
+ @retval DB_SUCCESS or error code */
+ dberr_t update_records(buf_block_t* block) UNIV_NOTHROW;
+
+ /** Validate the space flags and update tablespace header page.
+ @param block block read from file, not from the buffer pool.
+ @retval DB_SUCCESS or error code */
+ dberr_t update_header(buf_block_t* block) UNIV_NOTHROW;
+
+ /** Adjust the BLOB reference for a single column that is externally stored
+ @param rec record to update
+ @param offsets column offsets for the record
+ @param i column ordinal value
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const rec_offs* offsets,
+ ulint i) UNIV_NOTHROW;
+
+ /** Adjusts the BLOB reference in the clustered index row for all
+ externally stored columns.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+	/** In the clustered index, adjust the BLOB pointers as needed.
+ Also update the BLOB reference, write the new space id.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+ /** Purge delete-marked records, only if it is possible to do
+ so without re-organising the B+tree.
+ @retval true if purged */
+ bool purge() UNIV_NOTHROW;
+
+ /** Adjust the BLOB references and sys fields for the current record.
+ @param rec record to update
+ @param offsets column offsets for the record
+ @return DB_SUCCESS or error code. */
+ dberr_t adjust_cluster_record(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW;
+
+ /** Find an index with the matching id.
+ @return row_index_t* instance or 0 */
+ row_index_t* find_index(index_id_t id) UNIV_NOTHROW
+ {
+ row_index_t* index = &m_cfg->m_indexes[0];
+
+ for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) {
+ if (id == index->m_id) {
+ return(index);
+ }
+ }
+
+ return(0);
+
+ }
+private:
+ /** Config for table that is being imported. */
+ row_import* m_cfg;
+
+ /** Current index whose pages are being imported */
+ row_index_t* m_index;
+
+ /** Iterator over records in a block */
+ RecIterator m_rec_iter;
+
+ /** Record offset */
+ rec_offs m_offsets_[REC_OFFS_NORMAL_SIZE];
+
+ /** Pointer to m_offsets_ */
+ rec_offs* m_offsets;
+
+ /** Memory heap for the record offsets */
+ mem_heap_t* m_heap;
+
+ /** Cluster index instance */
+ dict_index_t* m_cluster_index;
+};
+
+/**
+row_import destructor. */
+row_import::~row_import() UNIV_NOTHROW
+{
+ for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) {
+ UT_DELETE_ARRAY(m_indexes[i].m_name);
+
+ if (m_indexes[i].m_fields == NULL) {
+ continue;
+ }
+
+ dict_field_t* fields = m_indexes[i].m_fields;
+ ulint n_fields = m_indexes[i].m_n_fields;
+
+ for (ulint j = 0; j < n_fields; ++j) {
+ UT_DELETE_ARRAY(const_cast<char*>(fields[j].name()));
+ }
+
+ UT_DELETE_ARRAY(fields);
+ }
+
+ for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) {
+ UT_DELETE_ARRAY(m_col_names[i]);
+ }
+
+ UT_DELETE_ARRAY(m_cols);
+ UT_DELETE_ARRAY(m_indexes);
+ UT_DELETE_ARRAY(m_col_names);
+ UT_DELETE_ARRAY(m_table_name);
+ UT_DELETE_ARRAY(m_hostname);
+}
+
+/** Find the index entry in the indexes array.
+@param name index name
+@return instance if found else 0. */
+row_index_t*
+row_import::get_index(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_indexes; ++i) {
+ const char* index_name;
+ row_index_t* index = &m_indexes[i];
+
+ index_name = reinterpret_cast<const char*>(index->m_name);
+
+ if (strcmp(index_name, name) == 0) {
+
+ return(index);
+ }
+ }
+
+ return(0);
+}
+
+/** Get the number of rows in the index.
+@param name index name
+@return number of rows (doesn't include delete marked rows). */
+ulint
+row_import::get_n_rows(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_rows);
+}
+
+/** Get the number of rows for which purge failed during the convert phase.
+@param name index name
+@return number of rows for which purge failed. */
+ulint
+row_import::get_n_purge_failed(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_purge_failed);
+}
+
+/** Find the ordinal value of the column name in the cfg table columns.
+@param name of column to look for.
+@return ULINT_UNDEFINED if not found. */
+ulint
+row_import::find_col(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_cols; ++i) {
+ const char* col_name;
+
+ col_name = reinterpret_cast<const char*>(m_col_names[i]);
+
+ if (strcmp(col_name, name) == 0) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**
+Check if the index schema that was read from the .cfg file matches the
+in memory index definition.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW
+{
+ row_index_t* cfg_index;
+ dberr_t err = DB_SUCCESS;
+
+ cfg_index = get_index(index->name);
+
+ if (cfg_index == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s not found in tablespace meta-data file.",
+ index->name());
+
+ return(DB_ERROR);
+ }
+
+ if (cfg_index->m_n_fields != index->n_fields) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index field count %u doesn't match"
+ " tablespace metadata file value " ULINTPF,
+ index->n_fields, cfg_index->m_n_fields);
+
+ return(DB_ERROR);
+ }
+
+ cfg_index->m_srv_index = index;
+
+ const dict_field_t* field = index->fields;
+ const dict_field_t* cfg_field = cfg_index->m_fields;
+
+ for (ulint i = 0; i < index->n_fields; ++i, ++field, ++cfg_field) {
+
+ if (field->name() && cfg_field->name()
+ && strcmp(field->name(), cfg_field->name()) != 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index field name %s doesn't match"
+ " tablespace metadata field name %s"
+ " for field position " ULINTPF,
+ field->name(), cfg_field->name(), i);
+
+ err = DB_ERROR;
+ }
+
+ if (cfg_field->prefix_len != field->prefix_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s prefix len %u"
+ " doesn't match metadata file value %u",
+ index->name(), field->name(),
+ field->prefix_len, cfg_field->prefix_len);
+
+ err = DB_ERROR;
+ }
+
+ if (cfg_field->fixed_len != field->fixed_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s fixed len %u"
+ " doesn't match metadata file value %u",
+ index->name(), field->name(),
+ field->fixed_len,
+ cfg_field->fixed_len);
+
+ err = DB_ERROR;
+ }
+ }
+
+ return(err);
+}
+
+/** Check if the table schema that was read from the .cfg file matches the
+in memory table definition.
+@param thd MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_table_columns(
+ THD* thd) UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+ const dict_col_t* col = m_table->cols;
+
+ for (ulint i = 0; i < m_table->n_cols; ++i, ++col) {
+
+ const char* col_name;
+ ulint cfg_col_index;
+
+ col_name = dict_table_get_col_name(
+ m_table, dict_col_get_no(col));
+
+ cfg_col_index = find_col(col_name);
+
+ if (cfg_col_index == ULINT_UNDEFINED) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s not found in tablespace.",
+ col_name);
+
+ err = DB_ERROR;
+ } else if (cfg_col_index != col->ind) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordinal value mismatch, it's at %u"
+ " in the table and " ULINTPF
+ " in the tablespace meta-data file",
+ col_name, col->ind, cfg_col_index);
+
+ err = DB_ERROR;
+ } else {
+ const dict_col_t* cfg_col;
+
+ cfg_col = &m_cols[cfg_col_index];
+ ut_a(cfg_col->ind == cfg_col_index);
+
+ if (cfg_col->prtype != col->prtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s precise type mismatch,"
+ " it's 0X%X in the table and 0X%X"
+ " in the tablespace meta file",
+ col_name, col->prtype, cfg_col->prtype);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mtype != col->mtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s main type mismatch,"
+ " it's 0X%X in the table and 0X%X"
+ " in the tablespace meta file",
+ col_name, col->mtype, cfg_col->mtype);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->len != col->len) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s length mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->len, cfg_col->len);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mbminlen != col->mbminlen
+ || cfg_col->mbmaxlen != col->mbmaxlen) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s multi-byte len mismatch,"
+ " it's %u-%u in the table and %u-%u"
+ " in the tablespace meta file",
+ col_name, col->mbminlen, col->mbmaxlen,
+ cfg_col->mbminlen, cfg_col->mbmaxlen);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ind != col->ind) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s position mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->ind, cfg_col->ind);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ord_part != col->ord_part) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordering mismatch,"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->ord_part,
+ cfg_col->ord_part);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->max_prefix != col->max_prefix) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s max prefix mismatch"
+ " it's %u in the table and %u"
+ " in the tablespace meta file",
+ col_name, col->max_prefix,
+ cfg_col->max_prefix);
+ err = DB_ERROR;
+ }
+ }
+ }
+
+ return(err);
+}
+
+dberr_t row_import::match_flags(THD *thd) const
+{
+ ulint mismatch= (m_table->flags ^ m_flags) & ~DICT_TF_MASK_DATA_DIR;
+ if (!mismatch)
+ return DB_SUCCESS;
+
+ const char *msg;
+ if (mismatch & DICT_TF_MASK_ZIP_SSIZE)
+ {
+ if ((m_table->flags & DICT_TF_MASK_ZIP_SSIZE) &&
+ (m_flags & DICT_TF_MASK_ZIP_SSIZE))
+ {
+ switch (m_flags & DICT_TF_MASK_ZIP_SSIZE) {
+ case 0U << DICT_TF_POS_ZIP_SSIZE:
+ goto uncompressed;
+ case 1U << DICT_TF_POS_ZIP_SSIZE:
+ msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1";
+ break;
+ case 2U << DICT_TF_POS_ZIP_SSIZE:
+ msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2";
+ break;
+ case 3U << DICT_TF_POS_ZIP_SSIZE:
+ msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4";
+ break;
+ case 4U << DICT_TF_POS_ZIP_SSIZE:
+ msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8";
+ break;
+ case 5U << DICT_TF_POS_ZIP_SSIZE:
+ msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=16";
+ break;
+ default:
+ msg= "strange KEY_BLOCK_SIZE";
+ }
+ }
+ else if (m_flags & DICT_TF_MASK_ZIP_SSIZE)
+ msg= "ROW_FORMAT=COMPRESSED";
+ else
+ goto uncompressed;
+ }
+ else
+ {
+ uncompressed:
+ msg= (m_flags & DICT_TF_MASK_ATOMIC_BLOBS) ? "ROW_FORMAT=DYNAMIC"
+ : (m_flags & DICT_TF_MASK_COMPACT) ? "ROW_FORMAT=COMPACT"
+ : "ROW_FORMAT=REDUNDANT";
+ }
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Table flags don't match, server table has 0x%x and the meta-data "
+ "file has 0x%zx; .cfg file uses %s",
+ m_table->flags, m_flags, msg);
+
+ return DB_ERROR;
+}
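+
+/* The ZIP_SSIZE field decoded by the switch above encodes
+KEY_BLOCK_SIZE = 2^(ssize - 1) KiB: 1 -> 1, 2 -> 2, 3 -> 4, 4 -> 8,
+5 -> 16; a value of 0 means the table is not ROW_FORMAT=COMPRESSED. */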
+
+/** Check if the table (and index) schema that was read from the .cfg file
+matches the in memory table definition.
+@param thd MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_schema(
+ THD* thd) UNIV_NOTHROW
+{
+ /* Do some simple checks. */
+
+ if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		/* If the number of indexes doesn't match then it is better
+ to abort the IMPORT. It is easy for the user to create a
+ table matching the IMPORT definition. */
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Number of indexes don't match, table has " ULINTPF
+ " indexes but the tablespace meta-data file has "
+ ULINTPF " indexes",
+ UT_LIST_GET_LEN(m_table->indexes), m_n_indexes);
+
+ return(DB_ERROR);
+ }
+
+ dberr_t err = match_table_columns(thd);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Check if the index definitions match. */
+
+ const dict_index_t* index;
+
+ for (index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ dberr_t index_err;
+
+ index_err = match_index_columns(thd, index);
+
+ if (index_err != DB_SUCCESS) {
+ err = index_err;
+ }
+ }
+
+ return(err);
+}
+
+/**
+Set the index root <space, pageno>, using index name. */
+void
+row_import::set_root_by_name() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) {
+ dict_index_t* index;
+
+ const char* index_name;
+
+ index_name = reinterpret_cast<const char*>(cfg_index->m_name);
+
+ index = dict_table_get_index_on_name(m_table, index_name);
+
+ /* We've already checked that it exists. */
+ ut_a(index != 0);
+
+ index->page = cfg_index->m_page_no;
+ }
+}
+
+/**
+Set the index root <space, pageno>, using a heuristic.
+@return DB_SUCCESS or error code */
+dberr_t
+row_import::set_root_by_heuristic() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ ut_a(m_n_indexes > 0);
+
+ // TODO: For now use brute force, based on ordinality
+
+ if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+ ib::warn() << "Table " << m_table->name << " should have "
+ << UT_LIST_GET_LEN(m_table->indexes) << " indexes but"
+ " the tablespace has " << m_n_indexes << " indexes";
+ }
+
+ ulint i = 0;
+ dberr_t err = DB_SUCCESS;
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ index->type |= DICT_CORRUPT;
+ ib::warn() << "Skipping FTS index: " << index->name;
+ } else if (i < m_n_indexes) {
+
+ UT_DELETE_ARRAY(cfg_index[i].m_name);
+
+ ulint len = strlen(index->name) + 1;
+
+ cfg_index[i].m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_14",
+ UT_DELETE_ARRAY(cfg_index[i].m_name);
+ cfg_index[i].m_name = NULL;
+ );
+
+ if (cfg_index[i].m_name == NULL) {
+ err = DB_OUT_OF_MEMORY;
+ break;
+ }
+
+ memcpy(cfg_index[i].m_name, index->name, len);
+
+ cfg_index[i].m_srv_index = index;
+
+ index->page = cfg_index[i++].m_page_no;
+ }
+ }
+
+ return(err);
+}
+
+/**
+Purge delete marked records.
+@return DB_SUCCESS or error code. */
+dberr_t
+IndexPurge::garbage_collect() UNIV_NOTHROW
+{
+ ibool comp = dict_table_is_comp(m_index->table);
+
+ /* Open the persistent cursor and start the mini-transaction. */
+
+ dberr_t err = open() ? next() : DB_CORRUPTION;
+
+ for (; err == DB_SUCCESS; err = next()) {
+
+ rec_t* rec = btr_pcur_get_rec(&m_pcur);
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ if (!deleted) {
+ ++m_n_rows;
+ } else {
+ err = purge();
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+ /* Close the persistent cursor and commit the mini-transaction. */
+
+ close();
+
+ return(err == DB_END_OF_INDEX ? DB_SUCCESS : err);
+}
+
+/**
+Begin import, position the cursor on the first record. */
+inline bool IndexPurge::open() noexcept
+{
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ btr_pcur_init(&m_pcur);
+
+ if (m_pcur.open_leaf(true, m_index, BTR_MODIFY_LEAF, &m_mtr) != DB_SUCCESS)
+ return false;
+
+ rec_t *rec= page_rec_get_next(btr_pcur_get_rec(&m_pcur));
+ if (!rec)
+ return false;
+ if (rec_is_metadata(rec, *m_index))
+ /* Skip the metadata pseudo-record. */
+ btr_pcur_get_page_cur(&m_pcur)->rec= rec;
+ return true;
+}
+
+/**
+Position the cursor on the next record.
+@return DB_SUCCESS or error code */
+dberr_t IndexPurge::next() noexcept
+{
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(&m_pcur))) {
+ return DB_CORRUPTION;
+ }
+
+ /* When switching pages, commit the mini-transaction
+ in order to release the latch on the old page. */
+
+ if (!btr_pcur_is_after_last_on_page(&m_pcur)) {
+ return(DB_SUCCESS);
+ } else if (trx_is_interrupted(m_trx)) {
+ /* Check after every page because the check
+ is expensive. */
+ return(DB_INTERRUPTED);
+ }
+
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+
+ mtr_commit(&m_mtr);
+
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ if (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr)
+ == btr_pcur_t::CORRUPTED) {
+ return DB_CORRUPTION;
+ }
+ /* The following is based on btr_pcur_move_to_next_user_rec(). */
+ m_pcur.old_rec = nullptr;
+ ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF);
+ do {
+ if (btr_pcur_is_after_last_on_page(&m_pcur)) {
+ if (btr_pcur_is_after_last_in_tree(&m_pcur)) {
+ return DB_END_OF_INDEX;
+ }
+
+ if (dberr_t err = btr_pcur_move_to_next_page(&m_pcur,
+ &m_mtr)) {
+ return err;
+ }
+ } else if (!btr_pcur_move_to_next_on_page(&m_pcur)) {
+ return DB_CORRUPTION;
+ }
+ } while (!btr_pcur_is_on_user_rec(&m_pcur));
+
+ return DB_SUCCESS;
+}
+
+/**
+Store the persistent cursor position and reopen the
+B-tree cursor in BTR_MODIFY_TREE mode, because the
+tree structure may be changed during a pessimistic delete. */
+inline dberr_t IndexPurge::purge_pessimistic_delete() noexcept
+{
+ dberr_t err;
+ if (m_pcur.restore_position(BTR_PURGE_TREE, &m_mtr) != btr_pcur_t::CORRUPTED)
+ {
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(&m_pcur),
+ m_index->table->not_redundant()));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0,
+ false, &m_mtr);
+ }
+ else
+ err= DB_CORRUPTION;
+
+ m_mtr.commit();
+ return err;
+}
+
+dberr_t IndexPurge::purge() noexcept
+{
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+ m_mtr.commit();
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ dberr_t err= purge_pessimistic_delete();
+
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ if (err == DB_SUCCESS)
+ err= (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) ==
+ btr_pcur_t::CORRUPTED)
+ ? DB_CORRUPTION : DB_SUCCESS;
+ return err;
+}
+
+/** Adjust the BLOB reference for a single column that is externally stored
+@param rec record to update
+@param offsets column offsets for the record
+@param i column ordinal value
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const rec_offs* offsets,
+ ulint i) UNIV_NOTHROW
+{
+ ulint len;
+ byte* field;
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_2",
+ len = BTR_EXTERN_FIELD_REF_SIZE - 1;);
+
+ if (len < BTR_EXTERN_FIELD_REF_SIZE) {
+
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Externally stored column(" ULINTPF
+ ") has a reference length of " ULINTPF
+ " in the cluster index %s",
+ i, len, m_cluster_index->name());
+
+ return(DB_CORRUPTION);
+ }
+
+ field += len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_SPACE_ID);
+
+ mach_write_to_4(field, get_space_id());
+
+ if (UNIV_LIKELY_NULL(m_rec_iter.current_block()->page.zip.data)) {
+ page_zip_write_blob_ptr(
+ m_rec_iter.current_block(), rec, m_cluster_index,
+ offsets, i, &m_rec_iter.m_mtr);
+ }
+
+ return(DB_SUCCESS);
+}
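+
+/* A BLOB reference occupies the last BTR_EXTERN_FIELD_REF_SIZE (20)
+bytes of the locally stored column prefix: 4 bytes of space id, 4 bytes
+of page number, 4 bytes of offset and 8 bytes of length and flags. Only
+the space id (at offset BTR_EXTERN_SPACE_ID) is rewritten above; the
+page numbers and offsets remain valid because the pages are imported in
+place. */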
+
+/** Adjusts the BLOB reference in the clustered index row for all externally
+stored columns.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ ut_ad(rec_offs_any_extern(offsets));
+
+ /* Adjust the space_id in the BLOB pointers. */
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) {
+
+ /* Only if the column is stored "externally". */
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_column(rec, offsets, i);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** In the clustered index, adjust BLOB pointers as needed. Also update the
+BLOB reference, write the new space id.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ if (rec_offs_any_extern(offsets)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_columns(rec, offsets);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Purge delete-marked records, only if it is possible to do so without
+re-organising the B+tree.
+@return true if purge succeeded */
+inline bool PageConverter::purge() UNIV_NOTHROW
+{
+ /* We can't have a page that is empty and not root. */
+ if (m_rec_iter.remove(m_offsets)) {
+
+ ++m_index->m_stats.m_n_purged;
+
+ return(true);
+ } else {
+ ++m_index->m_stats.m_n_purge_failed;
+ }
+
+ return(false);
+}
+
+/** Adjust the BLOB references and sys fields for the current record.
+@param rec record to update
+@param offsets column offsets for the record
+@return DB_SUCCESS or error code. */
+inline
+dberr_t
+PageConverter::adjust_cluster_record(
+ rec_t* rec,
+ const rec_offs* offsets) UNIV_NOTHROW
+{
+ dberr_t err;
+
+ if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) {
+
+ /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields
+ are only written in conjunction with other changes to the
+ record. */
+ ulint trx_id_pos = m_cluster_index->n_uniq
+ ? m_cluster_index->n_uniq : 1;
+ if (UNIV_LIKELY_NULL(m_rec_iter.current_block()
+ ->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ m_rec_iter.current_block(),
+ rec, m_offsets, trx_id_pos,
+ 0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS,
+ &m_rec_iter.m_mtr);
+ } else {
+ ulint len;
+ byte* ptr = rec_get_nth_field(
+ rec, m_offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memcpy(ptr, reset_trx_id, sizeof reset_trx_id);
+ }
+ }
+
+ return(err);
+}
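+
+/* After this step the record looks as if it had been written by a
+committed transaction that left no undo log: DB_TRX_ID is 0 and
+DB_ROLL_PTR has only the insert flag set (the contents of reset_trx_id),
+so readers of the imported table see the rows as committed, with no
+previous version to look up. */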
+
+/** Update the BLOB references and write UNDO log entries for
+rows that can't be purged optimistically.
+@param block block to update
+@retval DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_records(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ ibool comp = dict_table_is_comp(m_cfg->m_table);
+ bool clust_index = m_index->m_srv_index == m_cluster_index;
+
+ /* This will also position the cursor on the first user record. */
+
+ if (!m_rec_iter.open(block, m_index->m_srv_index)) {
+ return DB_CORRUPTION;
+ }
+
+ while (!m_rec_iter.end()) {
+ rec_t* rec = m_rec_iter.current();
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ /* For the clustered index we have to adjust the BLOB
+ reference and the system fields irrespective of the
+ delete marked flag. The adjustment of delete marked
+ cluster records is required for purge to work later. */
+
+ if (deleted || clust_index) {
+ m_offsets = rec_get_offsets(
+ rec, m_index->m_srv_index, m_offsets,
+ m_index->m_srv_index->n_core_fields,
+ ULINT_UNDEFINED, &m_heap);
+ }
+
+ if (clust_index) {
+
+ dberr_t err = adjust_cluster_record(rec, m_offsets);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* If it is a delete marked record then try an
+ optimistic delete. */
+
+ if (deleted) {
+ ++m_index->m_stats.m_n_deleted;
+ /* A successful purge will move the cursor to the
+ next record. */
+
+ if (purge()) {
+ continue;
+ }
+ } else {
+ ++m_index->m_stats.m_n_rows;
+ }
+
+ if (!m_rec_iter.next()) {
+ return DB_CORRUPTION;
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Update the space, index id, trx id.
+@return DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_index_page(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ const page_id_t page_id(block->page.id());
+
+ if (is_free(page_id.page_no())) {
+ return(DB_SUCCESS);
+ }
+
+ buf_frame_t* page = block->page.frame;
+ const index_id_t id = btr_page_get_index_id(page);
+
+ if (id != m_index->m_id) {
+ row_index_t* index = find_index(id);
+
+ if (UNIV_UNLIKELY(!index)) {
+ if (!m_cfg->m_missing) {
+ ib::warn() << "Unknown index id " << id
+ << " on page " << page_id.page_no();
+ }
+ return DB_SUCCESS;
+ }
+
+ m_index = index;
+ }
+
+ /* If the .cfg file is missing and there is an index mismatch
+ then ignore the error. */
+ if (m_cfg->m_missing && !m_index->m_srv_index) {
+ return(DB_SUCCESS);
+ }
+
+ if (m_index && page_id.page_no() == m_index->m_page_no) {
+ byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE
+ + page;
+ mach_write_to_4(b, page_id.space());
+
+ memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE
+ + page, b, 4);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy(&block->page.zip.data[FIL_PAGE_DATA
+ + PAGE_BTR_SEG_TOP
+ + FSEG_HDR_SPACE], b, 4);
+ memcpy(&block->page.zip.data[FIL_PAGE_DATA
+ + PAGE_BTR_SEG_LEAF
+ + FSEG_HDR_SPACE], b, 4);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!block->page.zip.data || page_zip_validate(&block->page.zip, page,
+ m_index->m_srv_index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* This has to be written to uncompressed index header. Set it to
+ the current index id. */
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID),
+ m_index->m_srv_index->id);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID],
+ &block->page.frame[PAGE_HEADER + PAGE_INDEX_ID], 8);
+ }
+
+ if (m_index->m_srv_index->is_clust()) {
+ if (page_id.page_no() != m_index->m_srv_index->page) {
+ goto clear_page_max_trx_id;
+ }
+ } else if (page_is_leaf(page)) {
+ /* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */
+ mach_write_to_8(&block->page.frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID], m_trx->id);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memcpy_aligned<8>(&block->page.zip.data
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
+ &block->page.frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID], 8);
+ }
+ } else {
+clear_page_max_trx_id:
+ /* Clear PAGE_MAX_TRX_ID so that it can be
+ used for other purposes in the future. IMPORT
+ in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1
+ would set the field to the transaction ID even
+ on clustered index pages. */
+ memset_aligned<8>(&block->page.frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
+ 0, 8);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ memset_aligned<8>(&block->page.zip.data
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
+ 0, 8);
+ }
+ }
+
+ if (page_is_empty(page)) {
+
+ /* Only a root page can be empty. */
+ if (page_has_siblings(page)) {
+ // TODO: We should relax this and skip secondary
+ // indexes. Mark them as corrupt because they can
+ // always be rebuilt.
+ return(DB_CORRUPTION);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ return page_is_leaf(block->page.frame)
+ ? update_records(block)
+ : DB_SUCCESS;
+}
+
+/** Validate the space flags and update tablespace header page.
+@param block block read from file, not from the buffer pool.
+@retval DB_SUCCESS or error code */
+inline dberr_t PageConverter::update_header(buf_block_t* block) UNIV_NOTHROW
+{
+ byte *frame= get_frame(block);
+ if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + frame,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, 4))
+ ib::warn() << "Space id check in the header failed: ignored";
+ else if (!mach_read_from_4(FIL_PAGE_SPACE_ID + frame))
+ return DB_CORRUPTION;
+
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+
+ /* Write space_id to the tablespace header, page 0. */
+ mach_write_to_4(FIL_PAGE_SPACE_ID + frame, get_space_id());
+ memcpy_aligned<2>(FSP_HEADER_OFFSET + FSP_SPACE_ID + frame,
+ FIL_PAGE_SPACE_ID + frame, 4);
+ /* Write back the adjusted flags. */
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + frame, m_space_flags);
+
+ return DB_SUCCESS;
+}
+
+/** Update the page, set the space id, max trx id and index id.
+@param block block read from file
+@retval DB_SUCCESS or error code */
+inline
+dberr_t
+PageConverter::update_page(buf_block_t* block, uint16_t& page_type)
+ UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!block->page.zip.data == !is_compressed_table());
+
+ switch (page_type = fil_page_get_type(get_frame(block))) {
+ case FIL_PAGE_TYPE_FSP_HDR:
+ ut_a(block->page.id().page_no() == 0);
+ /* Work directly on the uncompressed page headers. */
+ return(update_header(block));
+
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ /* We need to decompress the contents
+ before we can do anything. */
+
+ if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) {
+ return(DB_CORRUPTION);
+ }
+
+ /* fall through */
+ case FIL_PAGE_TYPE_INSTANT:
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ /* Only update the Btree nodes. */
+ return(update_index_page(block));
+
+ case FIL_PAGE_TYPE_SYS:
+ /* This is page 0 in the system tablespace. */
+ return(DB_CORRUPTION);
+
+ case FIL_PAGE_TYPE_XDES:
+ err = set_current_xdes(
+ block->page.id().page_no(), get_frame(block));
+ /* fall through */
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_TYPE_TRX_SYS:
+ case FIL_PAGE_IBUF_FREE_LIST:
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_BLOB:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+
+ /* Work directly on the uncompressed page headers. */
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ return(err);
+ }
+
+ ib::warn() << "Unknown page type (" << page_type << ")";
+
+ return(DB_CORRUPTION);
+}
+
+/** Called for every page in the tablespace. If the page was not
+updated then its state must be set to BUF_PAGE_NOT_USED.
+@param block block read from file, note it is not from the buffer pool
+@retval DB_SUCCESS or error code. */
+dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
+{
+ /* If we already had an old page with matching number
+ in the buffer pool, evict it now, because
+ we no longer evict the pages on DISCARD TABLESPACE. */
+ buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH,
+ nullptr, BUF_PEEK_IF_IN_POOL,
+ nullptr, nullptr, false);
+
+ uint16_t page_type;
+
+ if (dberr_t err = update_page(block, page_type)) {
+ return err;
+ }
+
+ const bool full_crc32 = fil_space_t::full_crc32(get_space_flags());
+ byte* frame = get_frame(block);
+ memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8);
+
+ if (!block->page.zip.data) {
+ buf_flush_init_for_writing(
+ NULL, block->page.frame, NULL, full_crc32);
+ } else if (fil_page_type_is_index(page_type)) {
+ buf_flush_init_for_writing(
+ NULL, block->page.zip.data, &block->page.zip,
+ full_crc32);
+ } else {
+ /* Calculate and update the checksum of non-index
+ pages for ROW_FORMAT=COMPRESSED tables. */
+ buf_flush_update_zip_checksum(
+ block->page.zip.data, block->zip_size());
+ }
+
+ return DB_SUCCESS;
+}
+
+/*****************************************************************//**
+Clean up after import tablespace. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_cleanup(
+/*===============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ dberr_t err) /*!< in: error code */
+{
+ if (err != DB_SUCCESS) {
+ dict_table_t* table = prebuilt->table;
+ table->file_unreadable = true;
+ if (table->space) {
+ fil_close_tablespace(table->space_id);
+ table->space = NULL;
+ }
+
+ prebuilt->trx->error_info = NULL;
+
+ ib::info() << "Discarding tablespace of table "
+ << table->name << ": " << err;
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ index->page = FIL_NULL;
+ }
+ }
+
+ DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+
+ prebuilt->trx->commit();
+
+ if (prebuilt->trx->dict_operation_lock_mode) {
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
+
+ prebuilt->trx->op_info = "";
+
+ DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
+
+ return(err);
+}
+
+/*****************************************************************//**
+Report error during tablespace import. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_error(
+/*=============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ dberr_t err) /*!< in: error code */
+{
+ if (!trx_is_interrupted(prebuilt->trx)) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ prebuilt->table->name.m_name);
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_INNODB_IMPORT_ERROR,
+ table_name, (ulong) err, ut_strerr(err));
+ }
+
+ return row_import_cleanup(prebuilt, err);
+}
+
+/*****************************************************************//**
+Adjust the index node and leaf node segment headers in the root pages
+of all the table's secondary indexes, updating them with the new space id.
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_adjust_root_pages_of_secondary_indexes(
+/*==============================================*/
+ trx_t* trx, /*!< in: transaction used for
+ the import */
+ dict_table_t* table, /*!< in: table the indexes
+ belong to */
+ const row_import& cfg) /*!< Import context */
+{
+ dict_index_t* index;
+ ulint n_rows_in_table;
+ dberr_t err = DB_SUCCESS;
+
+ /* Skip the clustered index. */
+ index = dict_table_get_first_index(table);
+
+ n_rows_in_table = cfg.get_n_rows(index->name);
+
+ DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure",
+ n_rows_in_table++;);
+
+ /* Adjust the root pages of the secondary indexes only. */
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ ut_a(!dict_index_is_clust(index));
+
+ if (!(index->type & DICT_CORRUPT)
+ && index->page != FIL_NULL) {
+
+ /* Update the Btree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+ } else {
+ ib::warn() << "Skip adjustment of root pages for"
+ " index " << index->name << ".";
+
+ err = DB_CORRUPTION;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ if (index->type & DICT_CLUSTERED) {
+ break;
+ }
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index %s not found or corrupt,"
+ " you should recreate this index.",
+ index->name());
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ index->type |= DICT_CORRUPT;
+ continue;
+ }
+
+ /* If we failed to purge any records in the index then
+ do it the hard way.
+
+ TODO: We can do this in the first pass by generating UNDO log
+ records for the failed rows. */
+
+ if (!cfg.requires_purge(index->name)) {
+ continue;
+ }
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "secondary: purge delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+
+ if (err != DB_SUCCESS) {
+ break;
+ } else if (purge.get_n_rows() != n_rows_in_table) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index '%s' contains " ULINTPF " entries, "
+ "should be " ULINTPF ", you should recreate "
+ "this index.", index->name(),
+ purge.get_n_rows(), n_rows_in_table);
+
+ index->type |= DICT_CORRUPT;
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */
+MY_ATTRIBUTE((nonnull)) static
+void
+row_import_set_sys_max_row_id(
+/*==========================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from
+ handler */
+ const dict_table_t* table) /*!< in: table to import */
+{
+ const rec_t* rec;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ row_id_t row_id = 0;
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+ ut_ad(index->is_primary());
+ ut_ad(dict_index_is_auto_gen_clust(index));
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
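+	/* Position the cursor at the end of the clustered index and
+	step back to the last user record on the page; because the
+	index is ordered by DB_ROW_ID, that record carries the largest
+	row id in the table. */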
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr)
+ == DB_SUCCESS) {
+ rec = btr_pcur_move_to_prev_on_page(&pcur);
+
+ if (!rec) {
+ /* The table is corrupted. */
+ } else if (page_rec_is_infimum(rec)) {
+ /* The table is empty. */
+ } else if (rec_is_metadata(rec, *index)) {
+ /* The clustered index contains the metadata
+ record only, that is, the table is empty. */
+ } else {
+ row_id = mach_read_from_6(rec);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ if (row_id) {
+ /* Update the system row id if the imported index row id is
+ greater than the max system row id. */
+ dict_sys.update_row_id(row_id);
+ }
+}
+
+/*****************************************************************//**
+Read a string from the meta data file.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_cfg_read_string(
+/*=======================*/
+ FILE* file, /*!< in/out: File to read from */
+ byte* ptr, /*!< out: string to read */
+ ulint max_len) /*!< in: maximum length of the output
+ buffer in bytes */
+{
+ DBUG_EXECUTE_IF("ib_import_string_read_error",
+ errno = EINVAL; return(DB_IO_ERROR););
+
+ ulint len = 0;
+
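+	/* A string in the .cfg file is stored NUL-terminated, and the
+	length field that precedes it includes the NUL byte; the read
+	only succeeds if the NUL byte appears at offset max_len - 1. */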
+ while (!feof(file)) {
+ int ch = fgetc(file);
+
+ if (ch == EOF) {
+ break;
+ } else if (ch != 0) {
+ if (len < max_len) {
+ ptr[len++] = static_cast<byte>(ch);
+ } else {
+ break;
+ }
+ /* max_len includes the NUL byte */
+ } else if (len != max_len - 1) {
+ break;
+ } else {
+ ptr[len] = 0;
+ return(DB_SUCCESS);
+ }
+ }
+
+ errno = EINVAL;
+
+ return(DB_IO_ERROR);
+}
+
+/*********************************************************************//**
+Read the meta data (index user fields) from the config file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_cfg_read_index_fields(
+/*=============================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_index_t* index) /*!< Index being read in */
+{
+ byte row[sizeof(ib_uint32_t) * 3];
+ ulint n_fields = index->m_n_fields;
+
+ index->m_fields = UT_NEW_ARRAY_NOKEY(dict_field_t, n_fields);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_4",
+ UT_DELETE_ARRAY(index->m_fields);
+ index->m_fields = NULL;
+ );
+
+ if (index->m_fields == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dict_field_t* field = index->m_fields;
+
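+	/* Each field is serialized as three 32-bit values (prefix_len,
+	fixed_len, name length) followed by the NUL-terminated field
+	name. */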
+ for (ulint i = 0; i < n_fields; ++i, ++field) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_1",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ new (field) dict_field_t();
+
+ field->prefix_len = mach_read_from_4(ptr) & ((1U << 12) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ field->fixed_len = mach_read_from_4(ptr) & ((1U << 10) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Include the NUL byte in the length. */
+ ulint len = mach_read_from_4(ptr);
+
+ byte* name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_5",
+ UT_DELETE_ARRAY(name);
+ name = NULL;
+ );
+
+ if (name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ field->name = reinterpret_cast<const char*>(name);
+
+ dberr_t err = row_import_cfg_read_string(file, name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+				"while parsing index field name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the index names and root page numbers of the indexes and set the values.
+Per-index row format: [id, space, page_no, type, trx_id_offset,
+n_user_defined_cols, n_uniq, n_nullable, n_fields, name length],
+followed by the NUL-terminated index name.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_index_data(
+/*=======================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte* ptr;
+ row_index_t* cfg_index;
+ byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9];
+
+ /* FIXME: What is the max value? */
+ ut_a(cfg->m_n_indexes > 0);
+ ut_a(cfg->m_n_indexes < 1024);
+
+ cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_6",
+ UT_DELETE_ARRAY(cfg->m_indexes);
+ cfg->m_indexes = NULL;
+ );
+
+ if (cfg->m_indexes == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ cfg_index = cfg->m_indexes;
+
+ for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) {
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_2",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the index data. */
+ size_t n_bytes = fread(row, 1, sizeof(row), file);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (n_bytes != sizeof(row)) {
+ char msg[BUFSIZ];
+
+ snprintf(msg, sizeof(msg),
+ "while reading index meta-data, expected "
+ "to read " ULINTPF
+ " bytes but read only " ULINTPF " bytes",
+ sizeof(row), n_bytes);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno), msg);
+
+ ib::error() << "IO Error: " << msg;
+
+ return(DB_IO_ERROR);
+ }
+
+ ptr = row;
+
+ cfg_index->m_id = mach_read_from_8(ptr);
+ ptr += sizeof(index_id_t);
+
+ cfg_index->m_space = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_page_no = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_type = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_trx_id_offset = mach_read_from_4(ptr);
+ if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) {
+ ut_ad(0);
+ /* Overflow. Pretend that the clustered index
+ has a variable-length PRIMARY KEY. */
+ cfg_index->m_trx_id_offset = 0;
+ }
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_uniq = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_nullable = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_fields = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ /* The NUL byte is included in the name length. */
+ ulint len = mach_read_from_4(ptr);
+
+ if (len > OS_FILE_MAX_PATH) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index name length (" ULINTPF ") is too long, "
+ "the meta-data is corrupt", len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_7",
+ UT_DELETE_ARRAY(cfg_index->m_name);
+ cfg_index->m_name = NULL;
+ );
+
+ if (cfg_index->m_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(file, cfg_index->m_name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing index name.");
+
+ return(err);
+ }
+
+ err = row_import_cfg_read_index_fields(file, thd, cfg_index);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the number of indexes and the per-index meta-data (v1 format).
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_read_indexes(
+/*====================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_3",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the number of indexes. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading number of indexes.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_n_indexes = mach_read_from_4(row);
+
+ if (cfg->m_n_indexes == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is 0");
+
+ return(DB_CORRUPTION);
+
+ } else if (cfg->m_n_indexes > 1024) {
+		/* FIXME: What is the upper limit? */
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is too high: "
+ ULINTPF, cfg->m_n_indexes);
+ cfg->m_n_indexes = 0;
+
+ return(DB_CORRUPTION);
+ }
+
+ return(row_import_read_index_data(file, thd, cfg));
+}
+
+/*********************************************************************//**
+Read the meta data (table columns) config file. Deserialise the contents of
+dict_col_t structure, along with the column name. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_columns(
+/*====================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 8];
+
+ /* FIXME: What should the upper limit be? */
+ ut_a(cfg->m_n_cols > 0);
+ ut_a(cfg->m_n_cols < 1024);
+
+ cfg->m_cols = UT_NEW_ARRAY_NOKEY(dict_col_t, cfg->m_n_cols);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_8",
+ UT_DELETE_ARRAY(cfg->m_cols);
+ cfg->m_cols = NULL;
+ );
+
+ if (cfg->m_cols == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ cfg->m_col_names = UT_NEW_ARRAY_NOKEY(byte*, cfg->m_n_cols);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_9",
+ UT_DELETE_ARRAY(cfg->m_col_names);
+ cfg->m_col_names = NULL;
+ );
+
+ if (cfg->m_col_names == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+	memset(cfg->m_cols, 0x0, sizeof(*cfg->m_cols) * cfg->m_n_cols);
+	memset(cfg->m_col_names, 0x0,
+	       sizeof(*cfg->m_col_names) * cfg->m_n_cols);
+
+ col = cfg->m_cols;
+
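+	/* Each column is serialized as eight 32-bit values (prtype,
+	mtype, len, mbminmaxlen, ind, ord_part, max_prefix, name length)
+	followed by the NUL-terminated column name. */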
+ for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_4",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading table column meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ col->prtype = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->mtype = static_cast<byte>(mach_read_from_4(ptr));
+ ptr += sizeof(ib_uint32_t);
+
+ col->len = static_cast<uint16_t>(mach_read_from_4(ptr));
+ ptr += sizeof(ib_uint32_t);
+
+ uint32_t mbminmaxlen = mach_read_from_4(ptr);
+ col->mbmaxlen = (mbminmaxlen / 5) & 7;
+ col->mbminlen = (mbminmaxlen % 5) & 7;
+ ptr += sizeof(ib_uint32_t);
+
+ col->ind = mach_read_from_4(ptr) & dict_index_t::MAX_N_FIELDS;
+ ptr += sizeof(ib_uint32_t);
+
+ col->ord_part = mach_read_from_4(ptr) & 1;
+ ptr += sizeof(ib_uint32_t);
+
+ col->max_prefix = mach_read_from_4(ptr) & ((1U << 12) - 1);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Read in the column name as [len, byte array]. The len
+ includes the NUL byte. */
+
+ ulint len = mach_read_from_4(ptr);
+
+ /* FIXME: What is the maximum column name length? */
+ if (len == 0 || len > 128) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_IO_READ_ERROR,
+				"Column name length " ULINTPF " is invalid",
+ len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_col_names[i] = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_10",
+ UT_DELETE_ARRAY(cfg->m_col_names[i]);
+ cfg->m_col_names[i] = NULL;
+ );
+
+ if (cfg->m_col_names[i] == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(
+ file, cfg->m_col_names[i], len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing table column name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_v1(
+/*===============*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< out: meta data */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_5",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the hostname where the tablespace was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data export hostname length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ ulint len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_hostname = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_1",
+ UT_DELETE_ARRAY(cfg->m_hostname);
+ cfg->m_hostname = NULL;
+ );
+
+ if (cfg->m_hostname == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing export hostname.");
+
+ return(err);
+ }
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_6",
+ (void) fseek(file, 0L, SEEK_END););
+
+	/* Read the table name of the tablespace that was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data table name length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_table_name = UT_NEW_ARRAY_NOKEY(byte, len);
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_2",
+ UT_DELETE_ARRAY(cfg->m_table_name);
+ cfg->m_table_name = NULL;
+ );
+
+ if (cfg->m_table_name == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ err = row_import_cfg_read_string(file, cfg->m_table_name, len);
+
+ if (err != DB_SUCCESS) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while parsing table name.");
+
+ return(err);
+ }
+
+ ib::info() << "Importing tablespace for table '" << cfg->m_table_name
+ << "' that was exported from host '" << cfg->m_hostname << "'";
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_7",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the autoinc value. */
+ if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_autoinc = mach_read_from_8(row);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_8",
+ (void) fseek(file, 0L, SEEK_END););
+
+	/* Read the tablespace page size, the table flags and the
+	number of columns. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data header.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte* ptr = row;
+
+ const ulint logical_page_size = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ if (logical_page_size != srv_page_size) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Tablespace to be imported has a different"
+ " page size than this server. Server page size"
+ " is %lu, whereas tablespace page size"
+ " is " ULINTPF,
+ srv_page_size,
+ logical_page_size);
+
+ return(DB_ERROR);
+ }
+
+ cfg->m_flags = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg->m_zip_size = dict_tf_get_zip_size(cfg->m_flags);
+ cfg->m_n_cols = mach_read_from_4(ptr);
+
+ if (!dict_tf_is_valid(cfg->m_flags)) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Invalid table flags: " ULINTPF, cfg->m_flags);
+
+ return(DB_CORRUPTION);
+ }
+
+ err = row_import_read_columns(file, thd, cfg);
+
+ if (err == DB_SUCCESS) {
+ err = row_import_read_indexes(file, thd, cfg);
+ }
+
+ return(err);
+}
+
+/**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_meta_data(
+/*======================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_9",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(&row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno),
+ "while reading meta-data version.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg.m_version = mach_read_from_4(row);
+
+ /* Check the version number. */
+ switch (cfg.m_version) {
+ case IB_EXPORT_CFG_VERSION_V1:
+
+ return(row_import_read_v1(file, thd, &cfg));
+ default:
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Unsupported meta-data version number (" ULINTPF "), "
+ "file ignored", cfg.m_version);
+ }
+
+ return(DB_ERROR);
+}
+
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB part header, in bytes */
+
+/* decrypt and decompress page if needed */
+static dberr_t decrypt_decompress(fil_space_crypt_t *space_crypt,
+ uint32_t space_flags, span<byte> page,
+ uint32_t space_id, byte *page_compress_buf)
+{
+ auto *data= page.data();
+
+ if (space_crypt && space_crypt->should_encrypt())
+ {
+ if (!buf_page_verify_crypt_checksum(data, space_flags))
+ return DB_CORRUPTION;
+
+ if (dberr_t err= fil_space_decrypt(space_id, space_flags, space_crypt,
+ data, page.size(), data))
+ return err;
+ }
+
+ bool page_compressed= false;
+
+ if (fil_space_t::full_crc32(space_flags) &&
+ fil_space_t::is_compressed(space_flags))
+ page_compressed= buf_page_is_compressed(data, space_flags);
+ else
+ {
+ switch (fil_page_get_type(data)) {
+ case FIL_PAGE_PAGE_COMPRESSED:
+ case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+ page_compressed= true;
+ }
+ }
+
+ if (page_compressed)
+ {
+ auto compress_length=
+ fil_page_decompress(page_compress_buf, data, space_flags);
+ ut_ad(compress_length != srv_page_size);
+
+ if (compress_length == 0)
+ return DB_CORRUPTION;
+ }
+
+ return DB_SUCCESS;
+}
+
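+/* Size of the scratch buffer used for compressing or decompressing a
+page: one page plus the extra space needed by the loaded compression
+provider (LZO work memory or the snappy worst-case output size), if
+any. */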
+static size_t get_buf_size()
+{
+ return srv_page_size + (
+ provider_service_lzo->is_loaded ? LZO1X_1_15_MEM_COMPRESS :
+ provider_service_snappy->is_loaded ? snappy_max_compressed_length(srv_page_size) :
+ 0
+ );
+}
+
+/* Find and parse the instant ALTER metadata, performing various checks,
+and apply it to the dict_table_t object.
+@return DB_SUCCESS or error code */
+static dberr_t handle_instant_metadata(dict_table_t *table,
+ const row_import &cfg)
+{
+ dict_get_and_save_data_dir_path(table);
+
+ char *filepath;
+ if (DICT_TF_HAS_DATA_DIR(table->flags))
+ {
+ ut_a(table->data_dir_path);
+ filepath= fil_make_filepath(table->data_dir_path, table->name, IBD, true);
+ }
+ else
+ filepath= fil_make_filepath(nullptr, table->name, IBD, false);
+
+ if (!filepath)
+ return DB_OUT_OF_MEMORY;
+
+ SCOPE_EXIT([filepath]() { ut_free(filepath); });
+
+ bool success;
+ auto file= os_file_create_simple_no_error_handling(
+ innodb_data_file_key, filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, false,
+ &success);
+ if (!success)
+ return DB_IO_ERROR;
+
+ if (os_file_get_size(file) < srv_page_size)
+ return DB_CORRUPTION;
+
+ SCOPE_EXIT([&file]() { os_file_close(file); });
+
+ std::unique_ptr<byte[], decltype(&aligned_free)> first_page(
+ static_cast<byte *>(aligned_malloc(srv_page_size, srv_page_size)),
+ &aligned_free);
+
+ if (dberr_t err= os_file_read(IORequestReadPartial, file, first_page.get(),
+ 0, srv_page_size, nullptr))
+ return err;
+
+ auto space_flags= fsp_header_get_flags(first_page.get());
+
+ if (!fil_space_t::is_valid_flags(space_flags, true))
+ {
+ auto cflags= fsp_flags_convert_from_101(space_flags);
+ if (cflags == UINT32_MAX)
+ return invalid_space_flags(space_flags);
+ space_flags= static_cast<decltype(space_flags)>(cflags);
+ }
+
+ if (!cfg.m_missing)
+ {
+ if (dberr_t err= cfg.match_flags(current_thd))
+ return err;
+ }
+
+ const unsigned zip_size= fil_space_t::zip_size(space_flags);
+ const unsigned physical_size= zip_size ? zip_size : unsigned(srv_page_size);
+ ut_ad(physical_size <= UNIV_PAGE_SIZE_MAX);
+ const uint32_t space_id= page_get_space_id(first_page.get());
+
+ auto *space_crypt= fil_space_read_crypt_data(zip_size, first_page.get());
+ SCOPE_EXIT([&space_crypt]() {
+ if (space_crypt)
+ fil_space_destroy_crypt_data(&space_crypt);
+ });
+
+ std::unique_ptr<byte[], decltype(&aligned_free)> page(
+ static_cast<byte *>(
+ aligned_malloc(UNIV_PAGE_SIZE_MAX, UNIV_PAGE_SIZE_MAX)),
+ &aligned_free);
+
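+  /* In a file-per-table tablespace, page 3 is the root page of the
+  clustered index; read it in order to find the instant ALTER
+  metadata record. */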
+ if (dberr_t err= os_file_read(
+ IORequestReadPartial, file, page.get(), 3 * physical_size,
+ physical_size, nullptr))
+ return err;
+
+ std::unique_ptr<byte[]> page_compress_buf(new byte[get_buf_size()]);
+
+ if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+ {page.get(), static_cast<size_t>
+ (physical_size)},
+ space_id, page_compress_buf.get()))
+ return err;
+
+ if (table->supports_instant())
+ {
+ dict_index_t *index= dict_table_get_first_index(table);
+
+ if (!page_is_comp(page.get()) != !dict_table_is_comp(table))
+ {
+ ib_errf(current_thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "ROW_FORMAT mismatch");
+ return DB_CORRUPTION;
+ }
+
+ if (btr_cur_instant_root_init(index, page.get()))
+ return DB_CORRUPTION;
+
+ ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
+
+ if (fil_page_get_type(page.get()) == FIL_PAGE_INDEX)
+ {
+ ut_ad(!index->is_instant());
+ return DB_SUCCESS;
+ }
+
+ mem_heap_t *heap= NULL;
+ SCOPE_EXIT([&heap]() {
+ if (heap)
+ mem_heap_free(heap);
+ });
+
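+    /* Descend from the clustered index root to the leftmost leaf
+    page; the instant ALTER metadata record, if present, is the
+    first user record on that page. */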
+ while (btr_page_get_level(page.get()) != 0)
+ {
+ const rec_t *rec= page_rec_get_next(page_get_infimum_rec(page.get()));
+ if (!rec)
+ return DB_CORRUPTION;
+
+ /* Relax the assertion in rec_init_offsets(). */
+ ut_ad(!index->in_instant_init);
+ ut_d(index->in_instant_init= true);
+ rec_offs *offsets=
+ rec_get_offsets(rec, index, nullptr, 0, ULINT_UNDEFINED, &heap);
+ ut_d(index->in_instant_init= false);
+
+ uint64_t child_page_no= btr_node_ptr_get_child_page_no(rec, offsets);
+
+ if (dberr_t err=
+ os_file_read(IORequestReadPartial, file, page.get(),
+ child_page_no * physical_size, physical_size, nullptr))
+ return err;
+
+ if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+ {page.get(), static_cast<size_t>
+ (physical_size)}, space_id,
+ page_compress_buf.get()))
+ return err;
+ }
+
+ const auto *rec= page_rec_get_next_const(page_get_infimum_rec(page.get()));
+ const auto comp= dict_table_is_comp(index->table);
+
+ if (!rec || page_rec_is_supremum(rec))
+ {
+ corrupted_metadata:
+ ib::error() << "Table " << index->table->name
+ << " is missing instant ALTER metadata";
+ index->table->corrupted= true;
+ return DB_CORRUPTION;
+ }
+
+ const auto info_bits= rec_get_info_bits(rec, comp);
+ if (!(info_bits & REC_INFO_MIN_REC_FLAG))
+ goto corrupted_metadata;
+
+ if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG ||
+ (comp && rec_get_status(rec) != REC_STATUS_INSTANT))
+ {
+ incompatible:
+ ib::error() << "Table " << index->table->name
+ << " contains unrecognizable instant ALTER metadata";
+ index->table->corrupted= true;
+ return DB_CORRUPTION;
+ }
+
+ if (info_bits & REC_INFO_DELETED_FLAG)
+ {
+ ulint trx_id_offset= index->trx_id_offset;
+ ut_ad(index->n_uniq);
+
+ if (trx_id_offset)
+ {
+ }
+ else if (index->table->not_redundant())
+ {
+
+ for (uint i= index->n_uniq; i--;)
+ trx_id_offset+= index->fields[i].fixed_len;
+ }
+ else if (rec_get_1byte_offs_flag(rec))
+ {
+ trx_id_offset= rec_1_get_field_end_info(rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
+ trx_id_offset&= ~REC_1BYTE_SQL_NULL_MASK;
+ }
+ else
+ {
+ trx_id_offset= rec_2_get_field_end_info(rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
+ trx_id_offset&= ~REC_2BYTE_SQL_NULL_MASK;
+ }
+
+ const byte *ptr=
+ rec + trx_id_offset + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (mach_read_from_4(ptr + BTR_EXTERN_LEN))
+ goto incompatible;
+
+ uint len= mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
+ if (!len || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) != FIL_PAGE_DATA)
+ goto incompatible;
+
+ std::unique_ptr<byte[], decltype(&aligned_free)>
+ second_page(static_cast<byte*>(aligned_malloc(physical_size,
+ physical_size)),
+ &aligned_free);
+
+ if (dberr_t err=
+ os_file_read(IORequestReadPartial, file, second_page.get(),
+ physical_size *
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO),
+ physical_size, nullptr))
+ return err;
+
+ if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
+ {second_page.get(),
+ static_cast<size_t>(physical_size)},
+ space_id, page_compress_buf.get()))
+ return err;
+
+ if (fil_page_get_type(second_page.get()) != FIL_PAGE_TYPE_BLOB ||
+ mach_read_from_4(
+ &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO]) !=
+ FIL_NULL ||
+ mach_read_from_4(
+ &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN]) != len)
+ goto incompatible;
+
+ /* The unused part of the BLOB page should be zero-filled. */
+ for (const byte *
+ b= second_page.get() + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) +
+ len,
+ *const end= second_page.get() + srv_page_size - BTR_EXTERN_LEN;
+ b < end;)
+ {
+ if (*b++)
+ goto incompatible;
+ }
+
+ if (index->table->deserialise_columns(
+ &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len))
+ goto incompatible;
+ }
+
+ rec_offs *offsets= rec_get_offsets(
+ rec, index, nullptr, index->n_core_fields, ULINT_UNDEFINED, &heap);
+ if (rec_offs_any_default(offsets))
+ {
+ inconsistent:
+ goto incompatible;
+ }
+
+ /* In fact, because we only ever append fields to the metadata
+ record, it is also OK to perform READ UNCOMMITTED and
+ then ignore any extra fields, provided that
+ trx_sys.is_registered(DB_TRX_ID). */
+ if (rec_offs_n_fields(offsets) >
+ ulint(index->n_fields) + !!index->table->instant &&
+ !trx_sys.is_registered(current_trx(),
+ row_get_rec_trx_id(rec, index, offsets)))
+ goto inconsistent;
+
+ for (unsigned i= index->n_core_fields; i < index->n_fields; i++)
+ {
+ dict_col_t *col= index->fields[i].col;
+ const unsigned o= i + !!index->table->instant;
+ ulint len;
+ const byte *data= rec_get_nth_field(rec, offsets, o, &len);
+ ut_ad(!col->is_added());
+ ut_ad(!col->def_val.data);
+ col->def_val.len= len;
+ switch (len) {
+ case UNIV_SQL_NULL:
+ continue;
+ case 0:
+ col->def_val.data= field_ref_zero;
+ continue;
+ }
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ if (!rec_offs_nth_extern(offsets, o))
+ col->def_val.data= mem_heap_dup(index->table->heap, data, len);
+ else if (len < BTR_EXTERN_FIELD_REF_SIZE ||
+ !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))
+ {
+ col->def_val.len= UNIV_SQL_DEFAULT;
+ goto inconsistent;
+ }
+ else
+ {
+ col->def_val.data= btr_copy_externally_stored_field(
+ &col->def_val.len, data, srv_page_size, len, index->table->heap);
+ }
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Read the contents of the <tablename>.cfg file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_import_read_cfg(
+/*================*/
+ dict_table_t* table, /*!< in: table */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ cfg.m_table = table;
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ FILE* file = fopen(name, "rb");
+
+ if (file == NULL) {
+ char msg[BUFSIZ];
+
+ snprintf(msg, sizeof(msg),
+ "Error opening '%s', will attempt to import"
+ " without schema verification", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno), msg);
+
+ cfg.m_missing = true;
+
+ err = DB_FAIL;
+ } else {
+
+ cfg.m_missing = false;
+
+ err = row_import_read_meta_data(file, thd, cfg);
+ fclose(file);
+ }
+
+ return(err);
+}
+
+/** Update the root page numbers and tablespace ID of a table.
+@param[in,out] trx dictionary transaction
+@param[in,out] table persistent table
+@param[in] reset whether to reset the fields to FIL_NULL
+@return DB_SUCCESS or error code */
+dberr_t
+row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
+{
+ const dict_index_t* index;
+ que_t* graph = 0;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(reset || table->space->id == table->space_id);
+
+ static const char sql[] = {
+ "PROCEDURE UPDATE_INDEX_ROOT() IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES\n"
+ "SET SPACE = :space,\n"
+ " PAGE_NO = :page,\n"
+ " TYPE = :type\n"
+ "WHERE TABLE_ID = :table_id AND ID = :index_id;\n"
+ "END;\n"};
+
+ table->def_trx_id = trx->id;
+
+ for (index = dict_table_get_first_index(table);
+ index != 0;
+ index = dict_table_get_next_index(index)) {
+
+ pars_info_t* info;
+ ib_uint32_t page;
+ ib_uint32_t space;
+ ib_uint32_t type;
+ index_id_t index_id;
+ table_id_t table_id;
+
+ info = (graph != 0) ? graph->info : pars_info_create();
+
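+		/* InnoDB stores DATA_INT values in big-endian format;
+		write the literals accordingly before binding them. */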
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&type),
+ index->type);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&page),
+ reset ? FIL_NULL : index->page);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&space),
+ reset ? FIL_NULL : index->table->space_id);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&index_id),
+ index->id);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&table_id),
+ table->id);
+
+ /* If we set the corrupt bit during the IMPORT phase then
+ we need to update the system tables. */
+ pars_info_bind_int4_literal(info, "type", &type);
+ pars_info_bind_int4_literal(info, "space", &space);
+ pars_info_bind_int4_literal(info, "page", &page);
+ pars_info_bind_ull_literal(info, "index_id", &index_id);
+ pars_info_bind_ull_literal(info, "table_id", &table_id);
+
+ if (graph == 0) {
+ graph = pars_sql(info, sql);
+ ut_a(graph);
+ graph->trx = trx;
+ }
+
+ que_thr_t* thr;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ DBUG_EXECUTE_IF("ib_import_internal_error",
+ trx->error_state = DB_ERROR;);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "While updating the <space, root page"
+ " number> of index %s - %s",
+ index->name(), ut_strerr(err));
+
+ break;
+ }
+ }
+
+ que_graph_free(graph);
+
+ return(err);
+}
+
+/** Callback arg for row_import_set_discarded. */
+struct discard_t {
+ ib_uint32_t flags2; /*!< Value read from column */
+ bool state; /*!< New state of the flag */
+ ulint n_recs; /*!< Number of recs processed */
+};
+
+/******************************************************************//**
+Fetch callback that sets or unsets the DISCARDED tablespace flag in
+SYS_TABLES. The flag is stored in the MIX_LEN column.
+@return FALSE if all OK */
+static
+ibool
+row_import_set_discarded(
+/*=====================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: bool set/unset flag */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ discard_t* discard = static_cast<discard_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == sizeof(ib_uint32_t));
+
+ ulint flags2 = mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+
+ if (discard->state) {
+ flags2 |= DICT_TF2_DISCARDED;
+ } else {
+ flags2 &= ~DICT_TF2_DISCARDED;
+ }
+
+ mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2);
+
+ ++discard->n_recs;
+
+ /* There should be at most one matching record. */
+ ut_a(discard->n_recs == 1);
+
+ return(FALSE);
+}
+
+/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN.
+@param[in,out] trx dictionary transaction
+@param[in] table_id table identifier
+@param[in] discarded whether to set or clear the flag
+@return DB_SUCCESS or error code */
+dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
+ bool discarded)
+{
+ pars_info_t* info;
+ discard_t discard;
+
+ static const char sql[] =
+ "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n"
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS\n"
+ " SELECT MIX_LEN"
+ " FROM SYS_TABLES"
+ " WHERE ID = :table_id FOR UPDATE;"
+ "\n"
+ "BEGIN\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_TABLES"
+ " SET MIX_LEN = :flags2"
+ " WHERE ID = :table_id;\n"
+ "CLOSE c;\n"
+ "END;\n";
+
+ discard.n_recs = 0;
+ discard.state = discarded;
+ discard.flags2 = ULINT32_UNDEFINED;
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "table_id", table_id);
+ pars_info_bind_int4_literal(info, "flags2", &discard.flags2);
+
+ pars_info_bind_function(
+ info, "my_func", row_import_set_discarded, &discard);
+
+ dberr_t err = que_eval_sql(info, sql, trx);
+
+ ut_a(discard.n_recs == 1);
+ ut_a(discard.flags2 != ULINT32_UNDEFINED);
+
+ return(err);
+}
+
+/** InnoDB writes the data page by page when a page_compressed
+tablespace is involved. This helps to save disk space when
+punch hole is enabled.
+@param iter Tablespace iterator
+@param full_crc32 whether the file is in the full_crc32 format
+@param offset offset of the file to be written
+@param writeptr buffer to be written
+@param n_bytes number of bytes to be written
+@param try_punch_only Only punch the hole for the range, because the
+                       current range consists of empty pages
+@return DB_SUCCESS or error code */
+static
+dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter,
+ bool full_crc32,
+ os_offset_t offset,
+ const byte *writeptr,
+ ulint n_bytes,
+ bool try_punch_only= false)
+{
+ if (dberr_t err= os_file_punch_hole(iter.file, offset, n_bytes))
+ return err;
+
+ if (try_punch_only)
+ return DB_SUCCESS;
+
+ for (ulint j= 0; j < n_bytes; j+= srv_page_size)
+ {
+    /* Read the original data length from the block. It is safer
+    to read FIL_PAGE_COMPRESSED_SIZE because it is not
+    encrypted. */
+ ulint n_write_bytes= srv_page_size;
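+    /* The first page of the file (j == 0 at offset 0) is never
+    page_compressed, so it is written out at the full page size. */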
+ if (j || offset)
+ {
+ n_write_bytes= mach_read_from_2(writeptr + j + FIL_PAGE_DATA);
+ const unsigned ptype= mach_read_from_2(writeptr + j + FIL_PAGE_TYPE);
+ /* Ignore the empty page */
+ if (ptype == 0 && n_write_bytes == 0)
+ continue;
+ if (full_crc32)
+ n_write_bytes= buf_page_full_crc32_size(writeptr + j,
+ nullptr, nullptr);
+ else
+ {
+ n_write_bytes+= ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED
+ ? FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN
+ : FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+ }
+ }
+
+ if (dberr_t err= os_file_write(IORequestWrite, iter.filepath, iter.file,
+ writeptr + j, offset + j, n_write_bytes))
+ return err;
+ }
+
+ return DB_SUCCESS;
+}
+
+dberr_t FetchIndexRootPages::run(const fil_iterator_t& iter,
+ buf_block_t* block) UNIV_NOTHROW
+{
+ const unsigned zip_size= fil_space_t::zip_size(m_space_flags);
+ const unsigned size= zip_size ? zip_size : unsigned(srv_page_size);
+ byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size()));
+ const bool full_crc32 = fil_space_t::full_crc32(m_space_flags);
+ bool skip_checksum_check = false;
+ ut_ad(!srv_read_only_mode);
+
+ if (!page_compress_buf)
+ return DB_OUT_OF_MEMORY;
+
+ const bool encrypted= iter.crypt_data != NULL &&
+ iter.crypt_data->should_encrypt();
+ byte* const readptr= iter.io_buffer;
+ block->page.frame= readptr;
+
+ if (block->page.zip.data)
+ block->page.zip.data= readptr;
+
+ bool page_compressed= false;
+
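+  /* Read page 3, the root page of the clustered index in a
+  file-per-table tablespace. */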
+ dberr_t err= os_file_read(IORequestReadPartial, iter.file, readptr,
+ 3 * size, size, nullptr);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << iter.filepath << ": os_file_read() failed";
+ goto func_exit;
+ }
+
+ if (page_get_page_no(readptr) != 3)
+ {
+page_corrupted:
+ ib::warn() << filename() << ": Page 3 at offset "
+ << 3 * size << " looks corrupted.";
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ block->page.id_.set_page_no(3);
+ if (full_crc32 && fil_space_t::is_compressed(m_space_flags))
+ page_compressed= buf_page_is_compressed(readptr, m_space_flags);
+ else
+ {
+ switch (fil_page_get_type(readptr)) {
+ case FIL_PAGE_PAGE_COMPRESSED:
+ case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+ if (block->page.zip.data)
+ goto page_corrupted;
+ page_compressed= true;
+ }
+ }
+
+ if (encrypted)
+ {
+ if (!buf_page_verify_crypt_checksum(readptr, m_space_flags))
+ goto page_corrupted;
+
+ if ((err= fil_space_decrypt(get_space_id(), m_space_flags, iter.crypt_data,
+ readptr, size, readptr)))
+ goto func_exit;
+ }
+
+ /* For full_crc32 format, skip checksum check
+ after decryption. */
+ skip_checksum_check= full_crc32 && encrypted;
+
+ if (page_compressed)
+ {
+ ulint compress_length= fil_page_decompress(page_compress_buf,
+ readptr,
+ m_space_flags);
+ ut_ad(compress_length != srv_page_size);
+ if (compress_length == 0)
+ goto page_corrupted;
+ }
+ else if (!skip_checksum_check
+ && buf_page_is_corrupted(false, readptr, m_space_flags))
+ goto page_corrupted;
+
+ err= this->operator()(block);
+func_exit:
+ free(page_compress_buf);
+ return err;
+}
+
+static dberr_t fil_iterate(
+ const fil_iterator_t& iter,
+ buf_block_t* block,
+ AbstractCallback& callback)
+{
+ os_offset_t offset;
+ const ulint size = callback.physical_size();
+ ulint n_bytes = iter.n_io_buffers * size;
+
+ byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size()));
+ ut_ad(!srv_read_only_mode);
+
+ if (!page_compress_buf) {
+ return DB_OUT_OF_MEMORY;
+ }
+
+ uint32_t actual_space_id = 0;
+ const bool full_crc32 = fil_space_t::full_crc32(
+ callback.get_space_flags());
+
+ /* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless
+ copying for non-index pages. Unfortunately, it is
+ required by buf_zip_decompress() */
+ dberr_t err = DB_SUCCESS;
+ bool page_compressed = false;
+ bool punch_hole = !my_test_if_thinly_provisioned(iter.file);
+
+ for (offset = iter.start; offset < iter.end; offset += n_bytes) {
+ if (callback.is_interrupted()) {
+ err = DB_INTERRUPTED;
+ goto func_exit;
+ }
+
+ byte* io_buffer = iter.io_buffer;
+ block->page.frame = io_buffer;
+
+ if (block->page.zip.data) {
+ /* Zip IO is done in the compressed page buffer. */
+ io_buffer = block->page.zip.data;
+ }
+
+ /* We have to read the exact number of bytes. Otherwise the
+ InnoDB IO functions croak on failed reads. */
+
+ n_bytes = ulint(ut_min(os_offset_t(n_bytes),
+ iter.end - offset));
+
+ ut_ad(n_bytes > 0);
+ ut_ad(!(n_bytes % size));
+
+ const bool encrypted = iter.crypt_data != NULL
+ && iter.crypt_data->should_encrypt();
+ /* Use additional crypt io buffer if tablespace is encrypted */
+ byte* const readptr = encrypted
+ ? iter.crypt_io_buffer : io_buffer;
+ byte* const writeptr = readptr;
+
+ err = os_file_read(IORequestReadPartial, iter.file, readptr,
+ offset, n_bytes, nullptr);
+ if (err != DB_SUCCESS) {
+ ib::error() << iter.filepath
+ << ": os_file_read() failed";
+ goto func_exit;
+ }
+
+ bool updated = false;
+ os_offset_t page_off = offset;
+ ulint n_pages_read = n_bytes / size;
+ /* This block is not attached to buf_pool */
+ block->page.id_.set_page_no(uint32_t(page_off / size));
+
+ for (ulint i = 0; i < n_pages_read;
+ ++block->page.id_,
+ ++i, page_off += size, block->page.frame += size) {
+ byte* src = readptr + i * size;
+ const ulint page_no = page_get_page_no(src);
+ if (!page_no && block->page.id().page_no()) {
+ if (!buf_is_zeroes(span<const byte>(src,
+ size))) {
+ goto page_corrupted;
+ }
+ /* Proceed to the next page,
+ because this one is all zero. */
+ continue;
+ }
+
+ if (page_no != block->page.id().page_no()) {
+page_corrupted:
+ ib::warn() << callback.filename()
+ << ": Page " << (offset / size)
+ << " at offset " << offset
+ << " looks corrupted.";
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (block->page.id().page_no() == 0) {
+ actual_space_id = mach_read_from_4(
+ src + FIL_PAGE_SPACE_ID);
+ }
+
+ const uint16_t type = fil_page_get_type(src);
+ page_compressed =
+ (full_crc32
+ && fil_space_t::is_compressed(
+ callback.get_space_flags())
+ && buf_page_is_compressed(
+ src, callback.get_space_flags()))
+ || type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED
+ || type == FIL_PAGE_PAGE_COMPRESSED;
+
+ if (page_compressed && block->page.zip.data) {
+ goto page_corrupted;
+ }
+
+ bool decrypted = false;
+ byte* dst = io_buffer + i * size;
+ bool frame_changed = false;
+ uint key_version = buf_page_get_key_version(
+ src, callback.get_space_flags());
+
+ if (!encrypted) {
+ } else if (!key_version) {
+ if (block->page.id().page_no() == 0
+ && block->page.zip.data) {
+ block->page.zip.data = src;
+ frame_changed = true;
+ } else if (!page_compressed
+ && type != FIL_PAGE_TYPE_XDES
+ && !block->page.zip.data) {
+ block->page.frame = src;
+ frame_changed = true;
+ } else {
+ ut_ad(dst != src);
+ memcpy(dst, src, size);
+ }
+ } else {
+ if (!buf_page_verify_crypt_checksum(
+ src, callback.get_space_flags())) {
+ goto page_corrupted;
+ }
+
+ if ((err = fil_space_decrypt(
+ actual_space_id,
+ callback.get_space_flags(),
+ iter.crypt_data, dst,
+ callback.physical_size(),
+ src))) {
+ goto func_exit;
+ }
+
+ decrypted = true;
+ updated = true;
+ }
+
+ /* For full_crc32 format, skip checksum check
+ after decryption. */
+ bool skip_checksum_check = full_crc32 && encrypted;
+
+ /* If the original page is page_compressed, we need
+ to decompress it before adjusting further. */
+ if (page_compressed) {
+ ulint compress_length = fil_page_decompress(
+ page_compress_buf, dst,
+ callback.get_space_flags());
+ ut_ad(compress_length != srv_page_size);
+ if (compress_length == 0) {
+ goto page_corrupted;
+ }
+ updated = true;
+ } else if (!skip_checksum_check
+ && buf_page_is_corrupted(
+ false,
+ encrypted && !frame_changed
+ ? dst : src,
+ callback.get_space_flags())) {
+ goto page_corrupted;
+ }
+
+ if ((err = callback(block)) != DB_SUCCESS) {
+ goto func_exit;
+ } else if (!updated) {
+ updated = !!block->page.frame;
+ }
+
+ /* If tablespace is encrypted we use additional
+ temporary scratch area where pages are read
+ for decrypting readptr == crypt_io_buffer != io_buffer.
+
+ Destination for decryption is a buffer pool block
+ block->page.frame == dst == io_buffer that is updated.
+ Pages that did not require decryption even when
+ tablespace is marked as encrypted are not copied
+ instead block->page.frame is set to src == readptr.
+
+ For encryption we again use temporary scratch area
+ writeptr != io_buffer == dst
+ that is then written to the tablespace
+
+ (1) For normal tables io_buffer == dst == writeptr
+ (2) For only page compressed tables
+ io_buffer == dst == writeptr
+ (3) For encrypted (and page compressed)
+ readptr != io_buffer == dst != writeptr
+ */
+
+ ut_ad(!encrypted && !page_compressed ?
+ src == dst && dst == writeptr + (i * size):1);
+ ut_ad(page_compressed && !encrypted ?
+ src == dst && dst == writeptr + (i * size):1);
+ ut_ad(encrypted ?
+ src != dst && dst != writeptr + (i * size):1);
+
+ /* When tablespace is encrypted or compressed its
+ first page (i.e. page 0) is not encrypted or
+ compressed and there is no need to copy frame. */
+ if (encrypted && block->page.id().page_no() != 0) {
+ byte *local_frame = callback.get_frame(block);
+ ut_ad((writeptr + (i * size)) != local_frame);
+ memcpy((writeptr + (i * size)), local_frame, size);
+ }
+
+ if (frame_changed) {
+ if (block->page.zip.data) {
+ block->page.zip.data = dst;
+ } else {
+ block->page.frame = dst;
+ }
+ }
+
+ src = io_buffer + (i * size);
+
+ if (page_compressed) {
+ updated = true;
+ if (ulint len = fil_page_compress(
+ src,
+ page_compress_buf,
+ callback.get_space_flags(),
+ 512,/* FIXME: proper block size */
+ encrypted)) {
+ /* FIXME: remove memcpy() */
+ memcpy(src, page_compress_buf, len);
+ memset(src + len, 0,
+ srv_page_size - len);
+ }
+ }
+
+ /* Encrypt the page if encryption was used. */
+ if (encrypted && decrypted) {
+ byte *dest = writeptr + i * size;
+
+ byte* tmp = fil_encrypt_buf(
+ iter.crypt_data,
+ block->page.id().space(),
+ block->page.id().page_no(),
+ src, block->zip_size(), dest,
+ full_crc32);
+
+ if (tmp == src) {
+ /* TODO: remove unnecessary memcpy's */
+ ut_ad(dest != src);
+ memcpy(dest, src, size);
+ }
+
+ updated = true;
+ }
+
+ /* Write checksum for the compressed full crc32 page.*/
+ if (full_crc32 && page_compressed) {
+ ut_ad(updated);
+ byte* dest = writeptr + i * size;
+ ut_d(bool comp = false);
+ ut_d(bool corrupt = false);
+ ulint size = buf_page_full_crc32_size(
+ dest,
+#ifdef UNIV_DEBUG
+ &comp, &corrupt
+#else
+ NULL, NULL
+#endif
+ );
+ ut_ad(!comp == (size == srv_page_size));
+ ut_ad(!corrupt);
+ mach_write_to_4(dest + (size - 4),
+ my_crc32c(0, dest, size - 4));
+ }
+ }
+
+ if (page_compressed && punch_hole) {
+ err = fil_import_compress_fwrite(
+ iter, full_crc32, offset, writeptr, n_bytes,
+ !updated);
+
+ if (err != DB_SUCCESS) {
+ punch_hole = false;
+ if (updated) {
+ goto normal_write;
+ }
+ }
+ } else if (updated) {
+normal_write:
+ /* A page was updated in the set, write it back. */
+ err = os_file_write(IORequestWrite,
+ iter.filepath, iter.file,
+ writeptr, offset, n_bytes);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+func_exit:
+ free(page_compress_buf);
+ return err;
+}
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return DB_SUCCESS or error code */
+static
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+ dict_table_t* table,
+ ulint n_io_buffers,
+ AbstractCallback& callback)
+{
+ dberr_t err;
+ pfs_os_file_t file;
+ char* filepath;
+
+ ut_a(n_io_buffers > 0);
+ ut_ad(!srv_read_only_mode);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
+ return(DB_CORRUPTION););
+
+ /* Make sure the data_dir_path is set. */
+ dict_get_and_save_data_dir_path(table);
+
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
+
+ const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+
+ filepath = fil_make_filepath(data_dir_path,
+ {table->name.m_name,
+ strlen(table->name.m_name)},
+ IBD, data_dir_path != nullptr);
+ if (!filepath) {
+ return(DB_OUT_OF_MEMORY);
+ } else {
+ bool success;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, filepath,
+ OS_FILE_OPEN, OS_FILE_READ_WRITE, false, &success);
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ ib::error() << "Trying to import a tablespace,"
+ " but could not open the tablespace file "
+ << filepath;
+ ut_free(filepath);
+ return DB_TABLESPACE_NOT_FOUND;
+ } else {
+ err = DB_SUCCESS;
+ }
+ }
+
+ callback.set_file(filepath, file);
+
+ os_offset_t file_size = os_file_get_size(file);
+ ut_a(file_size != (os_offset_t) -1);
+
+ /* Allocate a page to read in the tablespace header, so that we
+ can determine the page size and zip_size (if it is compressed).
+ We allocate an extra page in case it is a compressed table. */
+
+ byte* page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
+ srv_page_size));
+
+ buf_block_t* block = reinterpret_cast<buf_block_t*>
+ (ut_zalloc_nokey(sizeof *block));
+ block->page.frame = page;
+ block->page.init(buf_page_t::UNFIXED + 1, page_id_t{~0ULL});
+
+ /* Read the first page and determine the page size. */
+
+ err = os_file_read(IORequestReadPartial, file, page, 0, srv_page_size,
+ nullptr);
+
+ if (err == DB_SUCCESS) {
+ err = callback.init(file_size, block);
+ }
+
+ if (err == DB_SUCCESS) {
+ block->page.id_ = page_id_t(callback.get_space_id(), 0);
+ if (ulint zip_size = callback.get_zip_size()) {
+ page_zip_set_size(&block->page.zip, zip_size);
+ /* ROW_FORMAT=COMPRESSED is not optimised for block IO
+ for now. We do the IMPORT page by page. */
+ n_io_buffers = 1;
+ }
+
+ fil_iterator_t iter;
+
+ /* read (optional) crypt data */
+ iter.crypt_data = fil_space_read_crypt_data(
+ callback.get_zip_size(), page);
+
+ /* If tablespace is encrypted, it needs extra buffers */
+ if (iter.crypt_data && n_io_buffers > 1) {
+ /* decrease io buffers so that memory
+ consumption will not double */
+ n_io_buffers /= 2;
+ }
+
+ iter.file = file;
+ iter.start = 0;
+ iter.end = file_size;
+ iter.filepath = filepath;
+ iter.file_size = file_size;
+ iter.n_io_buffers = n_io_buffers;
+
+ /* Add an extra page for compressed page scratch area. */
+ iter.io_buffer = static_cast<byte*>(
+ aligned_malloc((1 + iter.n_io_buffers)
+ << srv_page_size_shift, srv_page_size));
+
+ iter.crypt_io_buffer = iter.crypt_data
+ ? static_cast<byte*>(
+ aligned_malloc((1 + iter.n_io_buffers)
+ << srv_page_size_shift,
+ srv_page_size))
+ : NULL;
+
+ if (block->page.zip.ssize) {
+ ut_ad(iter.n_io_buffers == 1);
+ block->page.frame = iter.io_buffer;
+ block->page.zip.data = block->page.frame
+ + srv_page_size;
+ }
+
+ err = callback.run(iter, block);
+
+ if (iter.crypt_data) {
+ fil_space_destroy_crypt_data(&iter.crypt_data);
+ }
+
+ aligned_free(iter.crypt_io_buffer);
+ aligned_free(iter.io_buffer);
+ }
+
+ if (err == DB_SUCCESS) {
+ ib::info() << "Sync to disk";
+
+ if (!os_file_flush(file)) {
+ ib::info() << "os_file_flush() failed!";
+ err = DB_IO_ERROR;
+ } else {
+ ib::info() << "Sync to disk - done!";
+ }
+ }
+
+ os_file_close(file);
+
+ aligned_free(page);
+ ut_free(filepath);
+ ut_free(block);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+{
+ dberr_t err;
+ ib_uint64_t autoinc = 0;
+ char* filepath = NULL;
+ trx_t* trx = prebuilt->trx;
+
+	/* The caller has ensured that this is not read_only_mode and that
+	no temporary tablespace is being imported. */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!table->is_temporary());
+
+ ut_ad(table->space_id);
+ ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
+ ut_ad(trx);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!table->is_readable());
+
+ ibuf_delete_for_discarded_space(table->space_id);
+
+ /* Assign an undo segment for the transaction, so that the
+ transaction will be recovered after a crash. */
+
+ /* TODO: Do not write any undo log for the IMPORT cleanup. */
+ {
+ mtr_t mtr;
+ mtr.start();
+ trx_undo_assign(trx, &err, &mtr);
+ mtr.commit();
+ }
+
+ DBUG_EXECUTE_IF("ib_import_undo_assign_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ if (err == DB_SUCCESS && !trx->has_logged_persistent()) {
+ err = DB_TOO_MANY_CONCURRENT_TRXS;
+ }
+ if (err != DB_SUCCESS) {
+ return row_import_cleanup(prebuilt, err);
+ }
+
+ trx->op_info = "read meta-data file";
+
+ row_import cfg;
+ THD* thd = trx->mysql_thd;
+
+ err = row_import_read_cfg(table, thd, cfg);
+
+ /* Check if the table column definitions match the contents
+ of the config file. */
+
+ if (err == DB_SUCCESS) {
+
+ if (dberr_t err = handle_instant_metadata(table, cfg)) {
+ return row_import_error(prebuilt, err);
+ }
+
+ /* We have a schema file, try and match it with our
+ data dictionary. */
+
+ err = cfg.match_schema(thd);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO to match the
+ B-tree root page numbers in the tablespace. Use the index
+ name from the .cfg file to find match. */
+
+ if (err == DB_SUCCESS) {
+ cfg.set_root_by_name();
+ autoinc = cfg.m_autoinc;
+ }
+
+ DBUG_EXECUTE_IF("ib_import_set_index_root_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ } else if (cfg.m_missing) {
+ /* We don't have a schema file, we will have to discover
+ the index root pages from the .ibd file and skip the schema
+ matching step. */
+
+ ut_a(err == DB_FAIL);
+
+ cfg.m_zip_size = 0;
+
+ if (UT_LIST_GET_LEN(table->indexes) > 1) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "Drop all secondary indexes before importing "
+ "table %s when .cfg file is missing.",
+ table->name.m_name);
+ err = DB_ERROR;
+ return row_import_error(prebuilt, err);
+ }
+
+ FetchIndexRootPages fetchIndexRootPages(table, trx);
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(srv_page_size),
+ fetchIndexRootPages);
+
+ if (err == DB_SUCCESS) {
+
+ err = fetchIndexRootPages.build_row_import(&cfg);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO
+ to match the B-tree root page numbers in the
+ tablespace. */
+
+ if (err == DB_SUCCESS) {
+ err = cfg.set_root_by_heuristic();
+
+ if (err == DB_SUCCESS) {
+ err = handle_instant_metadata(table,
+ cfg);
+ }
+ }
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ }
+
+ trx->op_info = "importing tablespace";
+
+ ib::info() << "Phase I - Update all pages";
+
+ /* Iterate over all the pages and do the sanity checking and
+ the conversion required to import the tablespace. */
+
+ PageConverter converter(&cfg, table->space_id, trx);
+
+ /* Set the IO buffer size in pages. */
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(cfg.m_zip_size ? cfg.m_zip_size
+ : srv_page_size), converter);
+
+ DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+#ifdef BTR_CUR_HASH_ADAPT
+ /* On DISCARD TABLESPACE, we did not drop any adaptive hash
+ index entries. If we replaced the discarded tablespace with a
+ smaller one here, there could still be some adaptive hash
+ index entries that point to cached garbage pages in the buffer
+ pool, because PageConverter::operator() only evicted those
+ pages that were replaced by the imported pages. We must
+ detach any remaining adaptive hash index entries, because the
+ adaptive hash index must be a subset of the table contents;
+ false positives are not tolerated. */
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ index = index->clone_if_needed();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (err != DB_SUCCESS) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ table->name.m_name);
+
+ if (err != DB_DECRYPTION_FAILED) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "Error importing tablespace for table %s : %s",
+ table_name, ut_strerr(err));
+ }
+
+ return row_import_cleanup(prebuilt, err);
+ }
+
+ /* If the table is stored in a remote tablespace, we need to
+ determine that filepath from the link file and system tables.
+ Find the space ID in SYS_TABLES since this is an ALTER TABLE. */
+ dict_get_and_save_data_dir_path(table);
+
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
+ const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ fil_space_t::name_type name{
+ table->name.m_name, strlen(table->name.m_name)};
+
+ filepath = fil_make_filepath(data_dir_path, name, IBD,
+ data_dir_path != nullptr);
+
+ DBUG_EXECUTE_IF(
+ "ib_import_OOM_15",
+ ut_free(filepath);
+ filepath = NULL;
+ );
+
+ if (filepath == NULL) {
+ return row_import_cleanup(prebuilt, DB_OUT_OF_MEMORY);
+ }
+
+	/* Open the tablespace so that we can access it via the buffer pool.
+ The tablespace is initially opened as a temporary one, because
+ we will not be writing any redo log for it before we have invoked
+ fil_space_t::set_imported() to declare it a persistent tablespace. */
+
+ table->space = fil_ibd_open(
+ 2, FIL_TYPE_IMPORT, table->space_id,
+ dict_tf_to_fsp_flags(table->flags), name, filepath, &err);
+
+ ut_ad((table->space == NULL) == (err != DB_SUCCESS));
+ DBUG_EXECUTE_IF("ib_import_open_tablespace_failure",
+ err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;);
+
+ if (!table->space) {
+ ib_senderrf(thd, IB_LOG_LEVEL_ERROR,
+ ER_GET_ERRMSG,
+ err, ut_strerr(err), filepath);
+ }
+
+ ut_free(filepath);
+
+ if (err == DB_SUCCESS) {
+ err = ibuf_check_bitmap_on_import(trx, table->space);
+ }
+
+ DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return row_import_cleanup(prebuilt, err);
+ }
+
+ /* The first index must always be the clustered index. */
+
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ if (!dict_index_is_clust(index)) {
+ return row_import_error(prebuilt, DB_CORRUPTION);
+ }
+
+	/* Update the B-tree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+
+ DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ } else if (cfg.requires_purge(index->name)) {
+
+ /* Purge any delete-marked records that couldn't be
+ purged during the page conversion phase from the
+ cluster index. */
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "cluster: purging delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+ }
+
+ DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ }
+
+ /* For secondary indexes, purge any records that couldn't be purged
+ during the page conversion phase. */
+
+ err = row_import_adjust_root_pages_of_secondary_indexes(
+ trx, table, cfg);
+
+ DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ }
+
+ /* Ensure that the next available DB_ROW_ID is not smaller than
+ any DB_ROW_ID stored in the table. */
+
+ if (prebuilt->clust_index_was_generated) {
+ row_import_set_sys_max_row_id(prebuilt, table);
+ }
+
+ ib::info() << "Phase III - Flush changes to disk";
+
+ /* Ensure that all pages dirtied during the IMPORT make it to disk.
+ The only dirty pages generated should be from the pessimistic purge
+ of delete marked records that couldn't be purged in Phase I. */
+ while (buf_flush_list_space(prebuilt->table->space));
+
+ for (ulint count = 0; prebuilt->table->space->referenced(); count++) {
+ /* Issue a warning every 10.24 seconds, starting after
+ 2.56 seconds */
+ if ((count & 511) == 128) {
+ ib::warn() << "Waiting for flush to complete on "
+ << prebuilt->table->name;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
+
+ ib::info() << "Phase IV - Flush complete";
+ prebuilt->table->space->set_imported();
+
+	/* The dictionary latches will be released in row_import_cleanup()
+ after the transaction commit, for both success and error. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Update the root pages of the table's indexes. */
+ err = row_import_update_index_root(trx, table, false);
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ }
+
+ err = row_import_update_discarded_flag(trx, table->id, false);
+
+ if (err != DB_SUCCESS) {
+ return row_import_error(prebuilt, err);
+ }
+
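+	/* The tablespace contents have been imported: mark the table
+	readable and clear the in-memory DISCARDED flag. */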
+ table->file_unreadable = false;
+ table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1);
+
+ /* Set autoinc value read from .cfg file, if one was specified.
+ Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */
+ if (autoinc) {
+ ib::info() << table->name << " autoinc value set to "
+ << autoinc;
+
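+		/* table->autoinc is the next value to assign; the counter
+		that btr_write_autoinc() persists in the root page records
+		the last assigned value, hence autoinc - 1. */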
+ table->autoinc = autoinc--;
+ btr_write_autoinc(dict_table_get_first_index(table), autoinc);
+ }
+
+ return row_import_cleanup(prebuilt, err);
+}
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
new file mode 100644
index 00000000..bdee0ed1
--- /dev/null
+++ b/storage/innobase/row/row0ins.cc
@@ -0,0 +1,3843 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.cc
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+#include "dict0dict.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "buf0lru.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif
+#ifdef WITH_WSREP
+#include <wsrep.h>
+#include <mysql/service_wsrep.h>
+#include "ha_prototypes.h"
+#endif /* WITH_WSREP */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/** Create a row template for each index of a table. */
+static void ins_node_create_entry_list(ins_node_t *node)
+{
+ node->entry_list.reserve(UT_LIST_GET_LEN(node->table->indexes));
+
+ for (dict_index_t *index= dict_table_get_first_index(node->table); index;
+ index= dict_table_get_next_index(index))
+ {
+ /* Corrupted or incomplete secondary indexes will be filtered out in
+ row_ins(). */
+ dtuple_t *entry= index->online_status >= ONLINE_INDEX_ABORTED
+ ? dtuple_create(node->entry_sys_heap, 0)
+ : row_build_index_entry_low(node->row, NULL, index, node->entry_sys_heap,
+ ROW_BUILD_FOR_INSERT);
+ node->entry_list.push_back(entry);
+ }
+}
+
+/*****************************************************************//**
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /*!< in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ const dict_col_t* col;
+ dfield_t* dfield;
+
+ row = node->row;
+ table = node->table;
+
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+	/* Allocate a buffer to hold the system-created hidden columns. */
+ compile_time_assert(DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ == sizeof node->sys_buf);
+ memset(node->sys_buf, 0, sizeof node->sys_buf);
+ /* Assign DB_ROLL_PTR to 1 << ROLL_PTR_INSERT_FLAG_POS */
+ node->sys_buf[DATA_ROW_ID_LEN + DATA_TRX_ID_LEN] = 0x80;
+ ut_ad(!memcmp(node->sys_buf + DATA_ROW_ID_LEN, reset_trx_id,
+ sizeof reset_trx_id));
+
+ /* 1. Populate row-id */
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, node->sys_buf, DATA_ROW_ID_LEN);
+
+ /* 2. Populate trx id */
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN],
+ DATA_TRX_ID_LEN);
+
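+	/* 3. Populate roll ptr */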
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN
+ + DATA_TRX_ID_LEN],
+ DATA_ROLL_PTR_LEN);
+}
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row) /*!< in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry_list.clear();
+ node->entry = node->entry_list.end();
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = 0;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ rec_offs** offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* dummy_big_rec;
+ upd_t* update;
+ rec_t* rec;
+ dberr_t err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!cursor->index()->is_clust());
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
+ ut_ad(!entry->info_bits);
+
+	/* We know that in the alphabetical ordering, entry and rec are
+	identical. But in their binary form there may be differences if
+ there are char fields in them. Therefore we have to calculate the
+ difference. */
+
+ update = row_upd_build_sec_rec_difference_binary(
+ rec, cursor->index(), *offsets, entry, heap);
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* We should never insert in place of a record that
+ has not been delete-marked. The only exception is when
+ online CREATE INDEX copied the changes that we already
+ made to the clustered index, and completed the
+ secondary index creation before we got here. In this
+ case, the change would already be there. The CREATE
+ INDEX should be in wait_while_table_is_used() at least
+ until this INSERT or UPDATE returns. After that point,
+ set_committed(true) would be invoked in
+ commit_inplace_alter_table(). */
+ ut_a(update->n_fields == 0);
+ ut_ad(!dict_index_is_online_ddl(cursor->index()));
+ return cursor->index()->is_committed()
+ ? DB_CORRUPTION : DB_SUCCESS;
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ /* TODO: pass only *offsets */
+ err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
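+		/* If the optimistic, within-page update cannot be performed,
+		report DB_FAIL so that the caller can retry with a
+		pessimistic (tree-modifying) operation. */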
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ ut_ad(mode == BTR_INSERT_TREE);
+ if (buf_pool.running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap,
+ heap, &dummy_big_rec, update, 0,
+ thr, thr_get_trx(thr)->id, mtr);
+ ut_ad(!dummy_big_rec);
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Does an insert operation by delete-unmarking and updating a delete-marked
+existing record in the index. This situation can occur if the delete-marked
+record is kept in the index for consistent reads.
+@return DB_SUCCESS, DB_FAIL, or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ btr_pcur_t* pcur, /*!< in/out: a persistent cursor pointing
+ to the clust_rec that is being modified. */
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap that can
+ be emptied, or NULL */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ const rec_t* rec;
+ upd_t* update;
+ dberr_t err = DB_SUCCESS;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ TABLE* mysql_table = NULL;
+ ut_ad(cursor->index()->is_clust());
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec,
+ cursor->index()->table->not_redundant()));
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(rec, cursor->index()));
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+ if (thr->prebuilt != NULL) {
+ mysql_table = thr->prebuilt->m_mysql_table;
+ ut_ad(thr->prebuilt->trx == thr_get_trx(thr));
+ }
+
+ update = row_upd_build_difference_binary(
+ cursor->index(), entry, rec, NULL, true, true,
+ thr_get_trx(thr), heap, mysql_table, &err);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED
+ || mode == BTR_MODIFY_ROOT_AND_LEAF
+ || mode == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED);
+
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(
+ flags, cursor, offsets, offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ if (buf_pool.running_out()) {
+ return DB_LOCK_TABLE_FULL;
+ }
+
+ big_rec_t* big_rec = NULL;
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_POS_FLAG,
+ cursor, offsets, offsets_heap, heap,
+ &big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr);
+
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+
+ DEBUG_SYNC_C("before_row_ins_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ pcur, *offsets, big_rec, mtr,
+ BTR_STORE_INSERT_UPDATE);
+ DEBUG_SYNC_C("after_row_ins_upd_extern");
+ dtuple_big_rec_free(big_rec);
+ }
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if, in a cascaded update/delete, an ancestor node of node
+updates (not DELETE, but UPDATE) the given table.
+@return TRUE if an ancestor updates table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ que_node_t* node, /*!< in: node in a query graph */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* parent;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ upd_node_t* upd_node;
+
+ upd_node = static_cast<upd_node_t*>(parent);
+
+ if (upd_node->table == table && !upd_node->is_delete) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return number of ancestors */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ que_node_t* node) /*!< in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ n_ancestors++;
+ }
+
+ return(n_ancestors);
+}
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return whether any FULLTEXT INDEX is affected */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ upd_node_t* node, /*!< in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap, /*!< in: memory heap to use as
+ temporary storage */
+ trx_t* trx) /*!< in: update transaction */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ ulint i;
+ ulint j;
+ bool doc_id_updated = false;
+ unsigned doc_id_pos = 0;
+ doc_id_t new_doc_id = FTS_NULL_DOC_ID;
+ ulint prefix_col;
+
+ ut_a(cascade);
+ ut_a(table);
+ ut_a(index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+
+ n_fields_updated = 0;
+
+ bool affects_fulltext = foreign->affects_fulltext();
+
+ if (table->fts) {
+ doc_id_pos = dict_table_get_nth_col_pos(
+ table, table->fts->doc_col, &prefix_col);
+ }
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(parent_index, i),
+ &prefix_col);
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ const upd_field_t* parent_ufield
+ = &parent_update->fields[j];
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint min_size;
+ const dict_col_t* col;
+ ulint ufield_len;
+ upd_field_t* ufield;
+
+ col = dict_index_get_nth_col(index, i);
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(
+ table, dict_col_get_no(col),
+ &prefix_col));
+
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+ dfield_get_type(&ufield->new_val)->prtype |=
+ col->prtype & DATA_VERSIONED;
+ ufield_len = dfield_get_len(&ufield->new_val);
+
+ /* Clear the "external storage" flag */
+ dfield_set_len(&ufield->new_val, ufield_len);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (dfield_is_null(&ufield->new_val)
+ && (col->prtype & DATA_NOT_NULL)) {
+ goto err_exit;
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (!dfield_is_null(&ufield->new_val)
+ && dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ col->len,
+ ufield_len,
+ static_cast<char*>(
+ dfield_get_data(
+ &ufield->new_val)))
+ < ufield_len) {
+ goto err_exit;
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ min_size = dict_col_get_min_size(col);
+
+ /* Because UNIV_SQL_NULL (the marker
+ of SQL NULL values) exceeds all possible
+ values of min_size, the test below will
+ not hold for SQL NULL columns. */
+
+ if (min_size > ufield_len) {
+
+ byte* pad;
+ ulint pad_len;
+ byte* padded_data;
+ ulint mbminlen;
+
+ padded_data = static_cast<byte*>(
+ mem_heap_alloc(
+ heap, min_size));
+
+ pad = padded_data + ufield_len;
+ pad_len = min_size - ufield_len;
+
+ memcpy(padded_data,
+ dfield_get_data(&ufield
+ ->new_val),
+ ufield_len);
+
+ mbminlen = dict_col_get_mbminlen(col);
+
+ ut_ad(!(ufield_len % mbminlen));
+ ut_ad(!(min_size % mbminlen));
+
+ if (mbminlen == 1
+ && dtype_get_charset_coll(
+ col->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL) {
+ /* Do not pad BINARY columns */
+ goto err_exit;
+ }
+
+ row_mysql_pad_col(mbminlen,
+ pad, pad_len);
+ dfield_set_data(&ufield->new_val,
+ padded_data, min_size);
+ }
+
+ /* If Doc ID is updated, check whether the
+ Doc ID is valid */
+ if (table->fts
+ && ufield->field_no == doc_id_pos) {
+ doc_id_t n_doc_id;
+
+ n_doc_id =
+ table->fts->cache->next_doc_id;
+
+ new_doc_id = fts_read_doc_id(
+ static_cast<const byte*>(
+ dfield_get_data(
+ &ufield->new_val)));
+
+ affects_fulltext = true;
+ doc_id_updated = true;
+
+ if (new_doc_id <= 0) {
+ ib::error() << "FTS Doc ID"
+ " must be larger than"
+ " 0";
+ goto err_exit;
+ }
+
+ if (new_doc_id < n_doc_id) {
+ ib::error() << "FTS Doc ID"
+ " must be larger than "
+ << n_doc_id - 1
+ << " for table "
+ << table->name;
+ goto err_exit;
+ }
+ }
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ if (affects_fulltext) {
+ ut_ad(table->fts);
+
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t doc_id;
+ doc_id_t* next_doc_id;
+ upd_field_t* ufield;
+
+ next_doc_id = static_cast<doc_id_t*>(mem_heap_alloc(
+ heap, sizeof(doc_id_t)));
+
+ ut_ad(!doc_id_updated);
+ ufield = update->fields + n_fields_updated;
+ fts_get_next_doc_id(table, next_doc_id);
+ doc_id = fts_update_doc_id(table, ufield, next_doc_id);
+ n_fields_updated++;
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ } else {
+ if (doc_id_updated) {
+ ut_ad(new_doc_id);
+ fts_trx_add_op(trx, table, new_doc_id,
+ FTS_INSERT, NULL);
+ } else {
+ ib::error() << "FTS Doc ID must be updated"
+ " along with FTS indexed column for"
+ " table " << table->name;
+err_exit:
+ n_fields_updated = ULINT_UNDEFINED;
+ }
+ }
+ }
+
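+	/* If the cascade cannot be performed, n_fields_updated was set
+	to ULINT_UNDEFINED at err_exit above; the caller checks for it. */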
+ update->n_fields = n_fields_updated;
+
+ return affects_fulltext;
+}
+
+/*********************************************************************//**
+Set detailed error message associated with foreign key errors for
+the given transaction. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&srv_misc_tmpfile_mutex);
+ rewind(srv_misc_tmpfile);
+
+ if (os_file_set_eof(srv_misc_tmpfile)) {
+ ut_print_name(srv_misc_tmpfile, trx,
+ foreign->foreign_table_name);
+ std::string fk_str = dict_print_info_on_foreign_key_in_create_format(
+ trx, foreign, FALSE);
+ fputs(fk_str.c_str(), srv_misc_tmpfile);
+ trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+ } else {
+ trx_set_detailed_error(trx, "temp file operation failed");
+ }
+
+ mysql_mutex_unlock(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
+and displays information about the given transaction.
+The caller must release dict_foreign_err_mutex. */
+TRANSACTIONAL_TARGET
+static
+void
+row_ins_foreign_trx_print(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ ut_ad(!srv_read_only_mode);
+
+ {
+ TMLockMutexGuard g{SRW_LOCK_CALL};
+ n_rec_locks = trx->lock.n_rec_locks;
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ }
+
+ mysql_mutex_lock(&dict_foreign_err_mutex);
+ rewind(dict_foreign_err_file);
+ ut_print_timestamp(dict_foreign_err_file);
+ fputs(" Transaction:\n", dict_foreign_err_file);
+
+ trx_print_low(dict_foreign_err_file, trx, 600,
+ n_rec_locks, n_trx_locks, heap_size);
+
+ mysql_mutex_assert_owner(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /*!< in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a matching index record in the
+ child table */
+ const dtuple_t* entry) /*!< in: index entry in the parent
+ table */
+{
+ std::string fk_str;
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign,
+ TRUE);
+ fputs(fk_str.c_str(), ef);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fprintf(ef, " in parent table, in index %s",
+ foreign->referenced_index->name());
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fprintf(ef, ", in index %s", foreign->foreign_index->name());
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ const dtuple_t* entry) /*!< in: index entry to insert in the
+ child table */
+{
+ std::string fk_str;
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign,
+ TRUE);
+ fputs(fk_str.c_str(), ef);
+ if (foreign->foreign_index) {
+ fprintf(ef, " in parent table, in index %s",
+ foreign->foreign_index->name());
+ } else {
+ fputs(" in parent table", ef);
+ }
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+ It would be better to only display the user columns. */
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fprintf(ef, ", in index %s,\n"
+ "the closest match we can find is record:\n",
+ foreign->referenced_index->name());
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->referenced_index);
+ }
+ putc('\n', ef);
+
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ const char* name) /*!< in: table name prefixed with
+ database name and a '/' character */
+{
+ innobase_invalidate_query_cache(thr_get_trx(thr), name);
+}
+
+/** Fill virtual column information in cascade node for the child table.
+@param[out] cascade child update node
+@param[in] rec clustered rec of child table
+@param[in] index clustered index of child table
+@param[in] node parent update node
+@param[in] foreign foreign key information
+@return error code. */
+static
+dberr_t
+row_ins_foreign_fill_virtual(
+ upd_node_t* cascade,
+ const rec_t* rec,
+ dict_index_t* index,
+ upd_node_t* node,
+ dict_foreign_t* foreign)
+{
+ THD* thd = current_thd;
+ row_ext_t* ext;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+ const rec_offs* offsets =
+ rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &cascade->heap);
+ TABLE* mysql_table= NULL;
+ upd_t* update = cascade->update;
+ ulint n_v_fld = index->table->n_v_def;
+ ulint n_diff;
+ upd_field_t* upd_field;
+ dict_vcol_set* v_cols = foreign->v_cols;
+ update->old_vrow = row_build(
+ ROW_COPY_DATA, index, rec,
+ offsets, index->table, NULL, NULL,
+ &ext, update->heap);
+ n_diff = update->n_fields;
+
+ ut_ad(index->table->vc_templ != NULL);
+
+ ib_vcol_row vc(NULL);
+ uchar *record = vc.record(thd, index, &mysql_table);
+ if (!record) {
+ return DB_OUT_OF_MEMORY;
+ }
+ ut_ad(!node->is_delete
+ || (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL));
+ ut_ad(foreign->type & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_CASCADE));
+
+ for (uint16_t i = 0; i < n_v_fld; i++) {
+
+ dict_v_col_t* col = dict_table_get_nth_v_col(
+ index->table, i);
+
+ dict_vcol_set::iterator it = v_cols->find(col);
+
+ if (it == v_cols->end()) {
+ continue;
+ }
+
+ dfield_t* vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, update->heap, NULL, thd, mysql_table,
+ record, NULL, NULL);
+
+ if (vfield == NULL) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ upd_field = update->fields + n_diff;
+
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(update->heap,
+ sizeof *upd_field->old_v_val));
+
+ dfield_copy(upd_field->old_v_val, vfield);
+
+ upd_field_set_v_field_no(upd_field, i, index);
+
+ dfield_t* new_vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, update->heap, NULL, thd,
+ mysql_table, record, NULL,
+ update);
+
+ if (new_vfield == NULL) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ dfield_copy(&upd_field->new_val, new_vfield);
+
+ if (!dfield_datas_are_binary_equal(
+ upd_field->old_v_val,
+ &upd_field->new_val, 0))
+ n_diff++;
+ }
+
+ update->n_fields = n_diff;
+ return DB_SUCCESS;
+}
+
+#ifdef WITH_WSREP
+dberr_t wsrep_append_foreign_key(trx_t *trx,
+ dict_foreign_t* foreign,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ bool referenced,
+ upd_node_t* upd_node,
+ bool pa_disable,
+ Wsrep_service_key_type key_type);
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /*!< in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /*!< in: index entry in the parent
+ table */
+ mtr_t* mtr) /*!< in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t*const*const fktable = &foreign->foreign_table;
+ dict_table_t* table = *fktable;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ const rec_t* rec;
+ const rec_t* clust_rec;
+ const buf_block_t* clust_block;
+ upd_t* update;
+ dberr_t err;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+
+ DBUG_ENTER("row_ins_foreign_check_on_constraint");
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+	the MySQL query cache for the table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the mutex rank above the lock_sys.latch. The query cache mutex
+ has a rank just above the lock_sys.latch. */
+
+ row_ins_invalidate_query_cache(thr, table->name.m_name);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ DBUG_RETURN(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ DBUG_RETURN(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+
+ }
+ cascade = node->cascade_node;
+ cascade->table = table;
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = PLAIN_DELETE;
+ } else {
+ cascade->is_delete = NO_DELETE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is
+ allowed, but not UPDATE) of the same table, as this
+ can lead to an infinite cycle. Check that we are not
+ updating the same table which is already being
+ modified in this cascade chain. We have to check this
+ also because the modification of the indexes of a
+ 'parent' table may still be incomplete, and we must
+ avoid seeing the indexes of the parent table in an
+ inconsistent state! */
+
+ if (row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying an update, possibly causing a cyclic"
+ " cascaded update\n"
+ "in the child table,", thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= FK_MAX_CASCADE_DEL) {
+ err = DB_FOREIGN_EXCEED_MAX_CASCADE;
+
+ row_ins_foreign_report_err(
+ "Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = pcur->index();
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ tmp_heap = mem_heap_create(256);
+
+ if (dict_index_is_clust(index)) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ clust_block = btr_pcur_get_block(pcur);
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ cascade->pcur->old_rec = nullptr;
+ cascade->pcur->btr_cur.page_cur.index = clust_index;
+ err = btr_pcur_open_with_no_init(ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto nonstandard_exit_func;
+ }
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+ clust_block = btr_pcur_get_block(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ ib::error() << "In cascade of a foreign key op index "
+ << index->name
+ << " of table " << index->table->name;
+
+ fputs("InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report to"
+ " https://jira.mariadb.org/\n", stderr);
+ ut_ad(0);
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(table, fktable, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(
+ 0, clust_block, clust_rec, clust_index,
+ LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec, clust_index));
+ /* This can happen if there is a circular reference of
+ rows such that cascading delete comes to delete a row
+ already in the process of being delete marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if (table->fts) {
+ doc_id = fts_get_doc_id_from_rec(
+ clust_rec, clust_index,
+ rec_get_offsets(clust_rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap));
+ }
+
+ if (node->is_delete
+ ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
+ /* Build the appropriate update vector which sets
+ foreign->n_fields first fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+ MEM_UNDEFINED(update->fields,
+ update->n_fields * sizeof *update->fields);
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ upd_field_t* ufield = &update->fields[i];
+ ulint col_no = dict_index_get_nth_col_no(
+ index, i);
+ ulint prefix_col;
+
+ ufield->field_no = static_cast<uint16_t>(
+ dict_table_get_nth_col_pos(
+ table, col_no, &prefix_col));
+ dict_col_t* col = dict_table_get_nth_col(
+ table, col_no);
+ dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
+
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+ dfield_set_null(&ufield->new_val);
+ }
+
+ if (foreign->affects_fulltext()) {
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+
+ if (foreign->v_cols != NULL
+ && foreign->v_cols->size() > 0) {
+ err = row_ins_foreign_fill_virtual(
+ cascade, clust_rec, clust_index,
+ node, foreign);
+
+ if (err != DB_SUCCESS) {
+ goto nonstandard_exit_func;
+ }
+ }
+ } else if (table->fts && cascade->is_delete == PLAIN_DELETE
+ && foreign->affects_fulltext()) {
+ /* DICT_FOREIGN_ON_DELETE_CASCADE case */
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+ /* Build the appropriate update vector which sets changing
+ foreign->n_fields first fields in rec to new values */
+
+ bool affects_fulltext = row_ins_cascade_calc_update_vec(
+ node, foreign, tmp_heap, trx);
+
+ if (foreign->v_cols && !foreign->v_cols->empty()) {
+ err = row_ins_foreign_fill_virtual(
+ cascade, clust_rec, clust_index,
+ node, foreign);
+
+ if (err != DB_SUCCESS) {
+ goto nonstandard_exit_func;
+ }
+ }
+
+ switch (cascade->update->n_fields) {
+ case ULINT_UNDEFINED:
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a cascaded update where the"
+ " updated value in the child\n"
+ "table would not fit in the length"
+ " of the column, or the value would\n"
+ "be NULL and the column is"
+ " declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ case 0:
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ /* Mark the old Doc ID as deleted */
+ if (affects_fulltext) {
+ ut_ad(table->fts);
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+ }
+
+ if (table->versioned() && cascade->is_delete != PLAIN_DELETE
+ && cascade->update->affects_versioned()) {
+ ut_ad(!cascade->historical_heap);
+ cascade->historical_heap = mem_heap_create(srv_page_size);
+ cascade->historical_row = row_build(
+ ROW_COPY_DATA, clust_index, clust_rec, NULL, table,
+ NULL, NULL, NULL, cascade->historical_heap);
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+#ifdef WITH_WSREP
+ if (trx->is_wsrep()) {
+ err = wsrep_append_foreign_key(trx, foreign, clust_rec, clust_index,
+ false, NULL, true,
+ WSREP_SERVICE_KEY_EXCLUSIVE);
+ if (err != DB_SUCCESS) {
+ goto nonstandard_exit_func;
+ }
+ }
+#endif /* WITH_WSREP */
+ mtr_commit(mtr);
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ if (pcur->restore_position(BTR_SEARCH_LEAF, mtr)
+ != btr_pcur_t::SAME_ALL) {
+ err = DB_CORRUPTION;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ DBUG_RETURN(err);
+
+nonstandard_exit_func:
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ if (pcur->restore_position(BTR_SEARCH_LEAF, mtr)
+ != btr_pcur_t::SAME_ALL && err == DB_SUCCESS) {
+ err = DB_CORRUPTION;
+ }
+
+ DBUG_RETURN(err);
+}
+
+/*********************************************************************//**
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_shared_rec_lock(
+/*========================*/
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_sys.latch.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* upd_node;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ int cmp;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ bool skip_gap_lock;
+
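+	/* At READ COMMITTED or lower isolation, no gap locks are taken
+	during this check. */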
+ skip_gap_lock = (trx->isolation_level <= TRX_ISO_READ_COMMITTED);
+
+ DBUG_ENTER("row_ins_check_foreign_constraint");
+
+ rec_offs_init(offsets_);
+
+#ifdef WITH_WSREP
+ upd_node= NULL;
+#endif /* WITH_WSREP */
+
+ if (!trx->check_foreigns) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
+ for (ulint i = 0; i < entry->n_fields; i++) {
+ dfield_t* field = dtuple_get_nth_field(entry, i);
+ if (i < foreign->n_fields && dfield_is_null(field)) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ /* System Versioning: if row_end != Inf, we
+ suppress the foreign key check */
+ if (field->type.vers_sys_end() && field->vers_history_row()) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (upd_node->is_delete != PLAIN_DELETE
+ && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+			we come here: if we checked the constraint here
+			for the child row, it would fail.
+
+			A QUESTION remains: if the child table has several
+			constraints which refer to the same parent table,
+			should we merge all updates to the child into one
+			update? The updates could even be contradictory.
+			Currently we just perform the update associated
+			with each foreign key constraint, one after
+			another, and the user has difficulty predicting
+			the order in which they are performed. */
+
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_INSERT) {
+ ins_node_t* insert_node =
+ static_cast<ins_node_t*>(thr->run_node);
+ dict_table_t* table = insert_node->index->table;
+ if (table->versioned()) {
+ dfield_t* row_end = dtuple_get_nth_field(
+ insert_node->row, table->vers_end);
+ if (row_end->vers_history_row()) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ }
+ }
+
+ dict_table_t *check_table;
+ dict_index_t *check_index;
+ dberr_t err = DB_SUCCESS;
+
+ {
+ dict_table_t*& fktable = check_ref
+ ? foreign->referenced_table : foreign->foreign_table;
+ check_table = fktable;
+ if (check_table) {
+ err = lock_table(check_table, &fktable, LOCK_IS, thr);
+ if (err != DB_SUCCESS) {
+ goto do_possible_lock_wait;
+ }
+ }
+ check_table = fktable;
+ }
+
+ check_index = check_ref
+ ? foreign->referenced_index : foreign->foreign_index;
+
+ if (!check_table || !check_table->is_readable() || !check_index) {
+ FILE* ef = dict_foreign_err_file;
+ std::string fk_str;
+
+ row_ins_set_detailed(trx, foreign);
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, check_ref
+ ? foreign->foreign_table_name
+ : foreign->referenced_table_name);
+ fputs(":\n", ef);
+ fk_str = dict_print_info_on_foreign_key_in_create_format(
+ trx, foreign, TRUE);
+ fputs(fk_str.c_str(), ef);
+ if (check_ref) {
+ if (foreign->foreign_index) {
+ fprintf(ef, "\nTrying to add to index %s"
+ " tuple:\n",
+ foreign->foreign_index->name());
+ } else {
+ fputs("\nTrying to add tuple:\n", ef);
+ }
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, foreign->referenced_table_name);
+ fputs("\nor its .ibd file or the required index does"
+ " not currently exist!\n", ef);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ if (foreign->referenced_index) {
+ fprintf(ef, "\nTrying to modify index %s"
+ " tuple:\n",
+ foreign->referenced_index->name());
+ } else {
+ fputs("\nTrying to modify tuple:\n", ef);
+ }
+ dtuple_print(ef, entry);
+ fputs("\nBut the referencing table ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ fputs("\nor its .ibd file or the required index does"
+ " not currently exist!\n", ef);
+ err = DB_ROW_IS_REFERENCED;
+ }
+
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+ goto exit_func;
+ }
+
+ mtr_start(&mtr);
+
+	/* Save the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+ pcur.btr_cur.page_cur.index = check_index;
+ err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto end_scan;
+ }
+
+ /* Scan index records and check if there is a matching record */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
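+		/* The page infimum is a dummy system record; skip it. */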
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, check_index, offsets,
+ check_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (skip_gap_lock) {
+
+ continue;
+ }
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+ rec, check_index,
+ offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ continue;
+ default:
+ goto end_scan;
+ }
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, check_index, offsets);
+
+ if (cmp == 0) {
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(!dict_index_is_clust(check_index)
+ || row_get_rec_trx_id(rec, check_index,
+ offsets));
+
+ err = row_ins_set_shared_rec_lock(
+ skip_gap_lock
+ ? LOCK_REC_NOT_GAP
+ : LOCK_ORDINARY, block,
+ rec, check_index, offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+ } else {
+ if (check_table->versioned()) {
+ bool history_row = false;
+
+ if (check_index->is_primary()) {
+ history_row = check_index->
+ vers_history_row(rec,
+ offsets);
+ } else if (check_index->
+ vers_history_row(rec,
+ history_row)) {
+ break;
+ }
+
+ if (history_row) {
+ continue;
+ }
+ }
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+#ifdef WITH_WSREP
+ if (trx->is_wsrep()) {
+ err = wsrep_append_foreign_key(
+ thr_get_trx(thr),
+ foreign,
+ rec,
+ check_index,
+ check_ref,
+ upd_node,
+ false,
+ WSREP_SERVICE_KEY_REFERENCE);
+ }
+#endif /* WITH_WSREP */
+ goto end_scan;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err = row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ /* Since reporting a plain
+ "duplicate key" error
+ message to the user in
+ cases where a long CASCADE
+ operation would lead to a
+ duplicate key in some
+ other table is very
+ confusing, map duplicate
+ key errors resulting from
+ FK constraints to a
+ separate error code. */
+
+ if (err == DB_DUPLICATE_KEY) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ goto end_scan;
+ }
+
+ /* row_ins_foreign_check_on_constraint
+ may have repositioned pcur on a
+ different block */
+ block = btr_pcur_get_block(&pcur);
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ goto end_scan;
+ }
+ }
+ } else {
+ ut_a(cmp < 0);
+
+ err = skip_gap_lock
+ ? DB_SUCCESS
+ : row_ins_set_shared_rec_lock(
+ LOCK_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ }
+ default:
+ break;
+ }
+
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+ if (check_ref) {
+ row_ins_foreign_report_add_err(
+ trx, foreign, btr_pcur_get_rec(&pcur), entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+end_scan:
+ mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ trx->error_state = err;
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ err = lock_wait(thr);
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ if (err == DB_SUCCESS) {
+ err = DB_LOCK_WAIT;
+ }
+ }
+
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/** Sets the values of the dtuple fields in ref_entry from the values of
+foreign columns in entry.
+@param[in] foreign foreign key constraint
+@param[in] index clustered index
+@param[in] entry tuple of clustered index
+@param[in,out]	ref_entry	tuple of foreign columns, filled in here
+@return true if all foreign key fields are present in the clustered index */
+static
+bool row_ins_foreign_index_entry(dict_foreign_t *foreign,
+ const dict_index_t *index,
+ const dtuple_t *entry,
+ dtuple_t *ref_entry)
+{
+ for (ulint i= 0; i < foreign->n_fields; i++)
+ {
+ for (ulint j= 0; j < index->n_fields; j++)
+ {
+ const dict_col_t *col= dict_index_get_nth_col(index, j);
+
+ /* A clustered index may contain instantly dropped columns,
+ which must be skipped. */
+ if (col->is_dropped())
+ continue;
+
+ const char *col_name= dict_table_get_col_name(index->table, col->ind);
+ if (0 == innobase_strcasecmp(col_name, foreign->foreign_col_names[i]))
+ {
+ dfield_copy(&ref_entry->fields[i], &entry->fields[j]);
+ goto got_match;
+ }
+ }
+ return false;
+got_match:
+ continue;
+ }
+
+ return true;
+}
+
+/***************************************************************//**
+Checks if foreign key constraints fail for an index entry. If the index
+is not mentioned in any constraint, this function does nothing.
+Otherwise, it searches the indexes of the referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_check_foreign_constraints(
+/*==============================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in: index */
+ bool pk, /*!< in: index->is_primary() */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_foreign_t* foreign;
+ dberr_t err = DB_SUCCESS;
+ mem_heap_t* heap = NULL;
+
+ DBUG_ASSERT(index->is_primary() == pk);
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "foreign_constraint_check_for_ins");
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ err == DB_SUCCESS && it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == index
+ || (pk && !foreign->foreign_index)) {
+
+ dtuple_t* ref_tuple = entry;
+ if (UNIV_UNLIKELY(!foreign->foreign_index)) {
+ /* Change primary key entry to
+ foreign key index entry */
+ if (!heap) {
+ heap = mem_heap_create(1000);
+ } else {
+ mem_heap_empty(heap);
+ }
+
+ ref_tuple = dtuple_create(
+ heap, foreign->n_fields);
+ dtuple_set_n_fields_cmp(
+ ref_tuple, foreign->n_fields);
+ if (!row_ins_foreign_index_entry(
+ foreign, index, entry, ref_tuple)) {
+ err = DB_NO_REFERENCED_ROW;
+ break;
+ }
+
+ }
+
+ dict_table_t* ref_table = NULL;
+ dict_table_t* referenced_table
+ = foreign->referenced_table;
+
+ if (referenced_table == NULL) {
+
+ ref_table = dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ false, DICT_ERR_IGNORE_NONE);
+ }
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, ref_tuple, thr);
+
+ if (ref_table) {
+ dict_table_close(ref_table);
+ }
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return err;
+}
+
+/***************************************************************//**
+Checks if inserting the index entry would cause a unique key violation
+against rec.
+@return TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ const rec_t* rec, /*!< in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, index, offsets, &matched_fields);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!dict_index_is_clust(index) && !index->nulls_equal) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+}
+
+/** Determine whether a history row was inserted by this transaction
+(row TRX_ID is the same as current TRX_ID).
+@param index secondary index
+@param rec secondary index record
+@param trx transaction
+@return error code
+@retval DB_SUCCESS on success
+@retval DB_FOREIGN_DUPLICATE_KEY if a history row was inserted by trx
+@retval DB_TABLE_CORRUPT if the secondary index is out of sync with
+the clustered index */
+static dberr_t vers_row_same_trx(dict_index_t* index, const rec_t* rec,
+ const trx_t& trx)
+{
+ mtr_t mtr;
+ dberr_t ret= DB_SUCCESS;
+ dict_index_t *clust_index= dict_table_get_first_index(index->table);
+ ut_ad(index != clust_index);
+
+ mtr.start();
+
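+  /* Look up the clustered index record that corresponds to the
+  secondary index record, so that its DB_TRX_ID and history-row
+  status can be examined. */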
+ if (const rec_t *clust_rec=
+ row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr))
+ {
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *clust_offs= offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t *heap= NULL;
+
+ clust_offs=
+ rec_get_offsets(clust_rec, clust_index, clust_offs,
+ clust_index->n_core_fields, ULINT_UNDEFINED, &heap);
+ if (clust_index->vers_history_row(clust_rec, clust_offs))
+ {
+ ulint trx_id_len;
+ const byte *trx_id= rec_get_nth_field(clust_rec, clust_offs,
+ clust_index->n_uniq, &trx_id_len);
+ ut_ad(trx_id_len == DATA_TRX_ID_LEN);
+
+ if (trx.id == trx_read_trx_id(trx_id))
+ ret= DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+ else
+ {
+ ib::error() << "foreign constraints: secondary index " << index->name <<
+ " of table " << index->table->name << " is out of sync";
+ ut_ad("secondary index is out of sync" == 0);
+ ret= DB_TABLE_CORRUPT;
+ }
+
+ mtr.commit();
+ return ret;
+}
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets shared locks on possible duplicate records.
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: non-clustered unique index */
+ dtuple_t* entry, /*!< in: index entry */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mem_heap_t* offsets_heap)
+ /*!< in/out: memory heap that can be emptied */
+{
+ ulint n_unique;
+ int cmp;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ rec_offs offsets_[REC_OFFS_SEC_INDEX_SIZE];
+ rec_offs* offsets = offsets_;
+ DBUG_ENTER("row_ins_scan_sec_index_for_duplicate");
+
+ rec_offs_init(offsets_);
+
+ ut_ad(!index->lock.have_any());
+
+ n_unique = dict_index_get_n_unique(index);
+
+ /* If the secondary index is unique, but one of the fields in the
+ n_unique first fields is NULL, a unique key violation cannot occur,
+ since we define NULL != NULL in this case */
+
+ if (!index->nulls_equal) {
+ for (ulint i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ }
+ }
+
+	/* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, n_unique);
+ pcur.btr_cur.page_cur.index = index;
+ trx_t* const trx = thr_get_trx(thr);
+ dberr_t err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+ &pcur, mtr);
+ if (err != DB_SUCCESS) {
+ goto end_scan;
+ }
+
+ /* Scan index records and check if there is a duplicate */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+ const ulint lock_type = LOCK_ORDINARY;
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+ /* Set no locks when applying log
+ in online table rebuild. */
+ } else if (trx->duplicates) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+			duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
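+			/* fall through */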
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ continue;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, index, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+
+ err = DB_DUPLICATE_KEY;
+
+ trx->error_info = index;
+
+ if (!index->table->versioned()) {
+ } else if (dberr_t e =
+ vers_row_same_trx(index, rec,
+ *trx)) {
+ err = e;
+ goto end_scan;
+ }
+
+ /* If the duplicate is on hidden FTS_DOC_ID,
+ state so in the error log */
+ if (index == index->table->fts_doc_id_index
+ && DICT_TF2_FLAG_IS_SET(
+ index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ ib::error() << "Duplicate FTS_DOC_ID"
+ " value on table "
+ << index->table->name;
+ }
+
+ goto end_scan;
+ }
+ } else {
+ ut_a(cmp < 0);
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, mtr));
+
+end_scan:
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ DBUG_RETURN(err);
+}
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@param n_uniq index->db_trx_id()
+@param entry entry being inserted
+@param rec clustered index record at insert position
+@param index clustered index
+@param offsets rec_get_offsets(rec)
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_online(ulint n_uniq, const dtuple_t *entry,
+ const rec_t *rec, const dict_index_t *index,
+ rec_offs *offsets)
+{
+ ulint fields = 0;
+
+ /* During rebuild, there should not be any delete-marked rows
+ in the new table. */
+ ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq);
+ ut_ad(n_uniq == index->db_trx_id());
+
+ /* Compare the PRIMARY KEY fields and the DB_TRX_ID, DB_ROLL_PTR. */
+ cmp_dtuple_rec_with_match_low(entry, rec, index, offsets, n_uniq + 2,
+ &fields);
+
+ if (fields < n_uniq) {
+ /* Not a duplicate. */
+ return(DB_SUCCESS);
+ }
+
+ ulint trx_id_len;
+
+ if (fields == n_uniq + 2
+ && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len),
+ reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ ut_ad(trx_id_len == DATA_TRX_ID_LEN);
+ /* rec is an exact match of entry, and DB_TRX_ID belongs
+ to a transaction that started after our ALTER TABLE. */
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ return(DB_DUPLICATE_KEY);
+}
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust_online(
+/*====================================*/
+ ulint n_uniq, /*!< in: offset of DB_TRX_ID */
+ const dtuple_t* entry, /*!< in: entry that is being inserted */
+ const btr_cur_t*cursor, /*!< in: cursor on insert position */
+ rec_offs** offsets,/*!< in/out: rec_get_offsets(rec) */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ dberr_t err = DB_SUCCESS;
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!cursor->index()->is_instant());
+
+ if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+ cursor->index()->n_fields,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry,
+ rec, cursor->index(), *offsets);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ if (!(rec = page_rec_get_next_const(btr_cur_get_rec(cursor)))) {
+ return DB_CORRUPTION;
+ }
+
+ if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+ cursor->index()->n_fields,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry,
+ rec, cursor->index(), *offsets);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@retval DB_SUCCESS if no error
+@retval DB_DUPLICATE_KEY if error,
+@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust(
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ rec_t* rec;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t*heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(cursor->index()->is_clust());
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete marked records with the same value for the non-clustered
+ index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+ clustered index key fields. For such a secondary index record,
+ to avoid race condition, we must FIRST do the insertion and after
+ that check that the uniqueness condition is not breached! */
+
+ /* NOTE: A problem is that in the B-tree node pointers on an
+ upper level may match more to the entry than the actual existing
+ user records on the leaf level. So, even if low_match would suggest
+ that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index());
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets,
+ cursor->index()
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+ /* Do nothing if no-locking is set */
+ err = DB_SUCCESS;
+ } else if (trx->duplicates) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+				duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index(), offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor), rec,
+ cursor->index(), offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index(), offsets)) {
+duplicate:
+ trx->error_info = cursor->index();
+ err = DB_DUPLICATE_KEY;
+ if (thr->prebuilt
+ && thr->prebuilt->upd_node
+ && thr->prebuilt->upd_node->is_delete
+ == VERSIONED_DELETE
+ && entry->vers_history_row())
+ {
+ ulint trx_id_len;
+ byte *trx_id = rec_get_nth_field(
+ rec, offsets, n_unique,
+ &trx_id_len);
+ ut_ad(trx_id_len == DATA_TRX_ID_LEN);
+ if (trx->id == trx_read_trx_id(trx_id)) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+ }
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ if (rec && !page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets,
+ cursor->index()
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (trx->duplicates) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+				duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index(), offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index(), offsets, thr);
+ }
+
+ switch (err) {
+ default:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index(),
+ offsets)) {
+ goto duplicate;
+ }
+			}
+
+			goto func_exit;
+		}
+
+ /* This should never happen */
+ err = DB_CORRUPTION;
+ }
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Checks if an index entry has long enough common prefix with an
+existing record so that the intended insert of the entry must be
+changed to a modify of the existing record. In the case of a clustered
+index, the prefix must be n_unique fields long. In the case of a
+secondary index, all fields must be equal. InnoDB never updates
+secondary index records in place, other than clearing or setting the
+delete-mark flag. We could be able to update the non-unique fields
+of a unique secondary index record by checking the cursor->up_match,
+but we do not do so, because it could have some locking implications.
+@return TRUE if the existing record should be updated; FALSE if not */
+UNIV_INLINE
+ibool
+row_ins_must_modify_rec(
+/*====================*/
+ const btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust)
+ Because node pointers on upper levels of the B-tree may match more
+ to entry than to actual user records on the leaf level, we
+ have to check if the candidate record is actually a user record.
+ A clustered index node pointer contains index->n_unique first fields,
+ and a secondary index node pointer contains all index fields. */
+
+ return(cursor->low_match
+ >= dict_index_get_n_unique_in_tree(cursor->index())
+ && !page_rec_is_infimum(btr_cur_get_rec(cursor)));
+}
+
+/** Insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@param[in] entry index entry to insert
+@param[in] big_rec externally stored fields
+@param[in,out] offsets rec_get_offsets()
+@param[in,out] heap memory heap
+@param[in] thd client connection, or NULL
+@param[in] index clustered index
+@return error code
+@retval DB_SUCCESS
+@retval DB_OUT_OF_FILE_SPACE */
+static
+dberr_t
+row_ins_index_entry_big_rec(
+ const dtuple_t* entry,
+ const big_rec_t* big_rec,
+ rec_offs* offsets,
+ mem_heap_t** heap,
+ dict_index_t* index,
+ const void* thd __attribute__((unused)))
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ rec_t* rec;
+
+ pcur.btr_cur.page_cur.index = index;
+ ut_ad(index->is_primary());
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch");
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ dberr_t error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_TREE,
+ &pcur, &mtr);
+ if (error != DB_SUCCESS) {
+ return error;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern");
+ error = btr_store_big_rec_extern_fields(
+ &pcur, offsets, big_rec, &mtr, BTR_STORE_INSERT);
+ DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern");
+
+ mtr.commit();
+
+ ut_free(pcur.old_rec_buf);
+ return(error);
+}
+
+#ifdef HAVE_REPLICATION /* Working around MDEV-24622 */
+extern "C" int thd_is_slave(const MYSQL_THD thd);
+#else
+# define thd_is_slave(thd) 0
+#endif
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock().
+We would only need this for row_ins_clust_index_entry_low(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete-marked record is then
+updated in place to become the inserted record, and we must write an
+undo log record on the delete-marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t pcur;
+ dberr_t err = DB_SUCCESS;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ uint64_t auto_inc = 0;
+ mem_heap_t* offsets_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* block;
+
+ DBUG_ENTER("row_ins_clust_index_entry_low");
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_unique(index)
+ || n_uniq == dict_index_get_n_unique(index));
+ ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index));
+ ut_ad(!trx->in_rollback);
+
+ mtr.start();
+
+ if (index->table->is_temporary()) {
+ /* Disable REDO logging as the lifetime of temp-tables is
+ limited to server or connection lifetime and so REDO
+ information is not needed on restart for recovery.
+ Disable locking as temp-tables are local to a connection. */
+
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(!index->table->persistent_autoinc);
+ ut_ad(!index->is_instant());
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+
+ if (UNIV_UNLIKELY(entry->is_metadata())) {
+ ut_ad(index->is_instant());
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(mode == BTR_MODIFY_TREE);
+ } else {
+ if (mode == BTR_MODIFY_LEAF
+ && dict_index_is_online_ddl(index)) {
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ }
+
+ if (unsigned ai = index->table->persistent_autoinc) {
+ /* Prepare to persist the AUTO_INCREMENT value
+ from the index entry to PAGE_ROOT_AUTO_INC. */
+ const dfield_t* dfield = dtuple_get_nth_field(
+ entry, ai - 1);
+ if (!dfield_is_null(dfield)) {
+ auto_inc = row_parse_int(
+ static_cast<const byte*>(
+ dfield->data),
+ dfield->len,
+ dfield->type.mtype,
+ dfield->type.prtype
+ & DATA_UNSIGNED);
+ if (auto_inc
+ && mode != BTR_MODIFY_TREE) {
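+						/* Upgrade BTR_MODIFY_LEAF
+						(or its ALREADY_LATCHED
+						variant) to the corresponding
+						BTR_MODIFY_ROOT_AND_LEAF mode,
+						so that the root page is also
+						latched and PAGE_ROOT_AUTO_INC
+						can be updated in this
+						mini-transaction. */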
+ mode = btr_latch_mode(
+ BTR_MODIFY_ROOT_AND_LEAF
+ ^ BTR_MODIFY_LEAF
+ ^ mode);
+ }
+ }
+ }
+ }
+ }
+
+ /* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+ pcur.btr_cur.page_cur.index = index;
+ err = btr_pcur_open(entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ index->table->file_unreadable = true;
+err_exit:
+ mtr.commit();
+ goto func_exit;
+ }
+
+ if (auto_inc) {
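+		/* The root page is latched by this mini-transaction;
+		persist the AUTO_INCREMENT value of the inserted row
+		in its PAGE_ROOT_AUTO_INC field. */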
+ buf_block_t* root
+ = mtr.at_savepoint(mode != BTR_MODIFY_ROOT_AND_LEAF);
+ ut_ad(index->page == root->page.id().page_no());
+ page_set_autoinc(root, auto_inc, &mtr, false);
+ }
+
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_pcur_get_page(&pcur);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_n_fields_is_sane(index, first_rec, entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ block = btr_pcur_get_block(&pcur);
+
+ DBUG_EXECUTE_IF("row_ins_row_level", goto skip_bulk_insert;);
+
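+	/* If the clustered index is still empty and nothing in the
+	transaction or table state forbids it, switch to bulk insert:
+	a single TRX_UNDO_EMPTY undo log record is written and the
+	subsequent inserts into this table are buffered. */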
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)
+ && page_is_empty(block->page.frame)
+ && !entry->is_metadata() && !trx->duplicates
+ && !trx->check_unique_secondary && !trx->check_foreigns
+ && !trx->dict_operation
+ && block->page.id().page_no() == index->page
+ && !index->table->skip_alter_undo
+ && !index->table->n_rec_locks
+ && !index->table->is_active_ddl()
+ && !index->table->has_spatial_index()
+ && !index->table->versioned()
+ && !thd_is_slave(trx->mysql_thd) /* FIXME: MDEV-24622 */) {
+ DEBUG_SYNC_C("empty_root_page_insert");
+
+ trx->bulk_insert = true;
+
+ if (!index->table->is_temporary()) {
+ err = lock_table(index->table, NULL, LOCK_X, thr);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = err;
+ trx->bulk_insert = false;
+ goto err_exit;
+ }
+
+ if (index->table->n_rec_locks) {
+avoid_bulk:
+ trx->bulk_insert = false;
+ goto skip_bulk_insert;
+ }
+
+#ifdef WITH_WSREP
+ if (trx->is_wsrep())
+ {
+ if (!wsrep_thd_is_local_transaction(trx->mysql_thd))
+ goto skip_bulk_insert;
+ if (wsrep_append_table_key(trx->mysql_thd, *index->table))
+ {
+ trx->error_state = DB_ROLLBACK;
+ goto err_exit;
+ }
+ }
+#endif /* WITH_WSREP */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (btr_search_enabled) {
+ btr_search_x_lock_all();
+ index->table->bulk_trx_id = trx->id;
+ btr_search_x_unlock_all();
+ } else {
+ index->table->bulk_trx_id = trx->id;
+ }
+#else /* BTR_CUR_HASH_ADAPT */
+ index->table->bulk_trx_id = trx->id;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* Write TRX_UNDO_EMPTY undo log and
+ start buffering the insert operation */
+ err = trx_undo_report_row_operation(
+ thr, index, entry,
+ nullptr, 0, nullptr, nullptr,
+ nullptr);
+
+ if (err != DB_SUCCESS) {
+ goto avoid_bulk;
+ }
+
+ goto err_exit;
+ }
+ }
+
+skip_bulk_insert:
+ if (UNIV_UNLIKELY(entry->info_bits != 0)) {
+ ut_ad(entry->is_metadata());
+ ut_ad(flags == BTR_NO_LOCKING_FLAG);
+ ut_ad(index->is_instant());
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+
+ if (rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG) {
+ trx->error_info = index;
+ err = DB_DUPLICATE_KEY;
+ goto err_exit;
+ }
+
+ ut_ad(!row_ins_must_modify_rec(&pcur.btr_cur));
+ goto do_insert;
+ }
+
+ if (rec_is_metadata(btr_pcur_get_rec(&pcur), *index)) {
+ goto do_insert;
+ }
+
+ if (n_uniq
+ && (pcur.btr_cur.up_match >= n_uniq
+ || pcur.btr_cur.low_match >= n_uniq)) {
+
+ if (flags
+ == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) {
+ /* Set no locks when applying log
+ in online table rebuild. Only check for duplicates. */
+ err = row_ins_duplicate_error_in_clust_online(
+ n_uniq, entry, &pcur.btr_cur,
+ &offsets, &offsets_heap);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_DUPLICATE_KEY:
+ trx->error_info = index;
+ }
+ } else {
+ /* Note that the following may return also
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(
+ flags, &pcur.btr_cur, entry, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+
+ /* Note: Allowing duplicates would qualify for modification of
+	an existing record, as the new entry is exactly the same as the old one. */
+ if (row_ins_must_modify_rec(&pcur.btr_cur)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ mem_heap_t* entry_heap = mem_heap_create(1024);
+
+ err = row_ins_clust_index_entry_by_modify(
+ &pcur, flags, mode, &offsets, &offsets_heap,
+ entry_heap, entry, thr, &mtr);
+
+ mtr_commit(&mtr);
+ mem_heap_free(entry_heap);
+ } else {
+ if (index->is_instant()) entry->trim(*index);
+do_insert:
+ rec_t* insert_rec;
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED
+ || mode == BTR_MODIFY_ROOT_AND_LEAF
+ || mode
+ == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED);
+ err = btr_cur_optimistic_insert(
+ flags, &pcur.btr_cur, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ } else {
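+			/* A pessimistic insert may need to allocate and
+			modify many pages; refuse it while the buffer
+			pool is running out of free blocks. */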
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto err_exit;
+ }
+
+ err = btr_cur_optimistic_insert(
+ flags, &pcur.btr_cur,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, &pcur.btr_cur,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ }
+ }
+
+ mtr.commit();
+
+ if (big_rec) {
+ /* Online table rebuild could read (and
+ ignore) the incomplete record at this point.
+ If online rebuild is in progress, the
+ row_ins_index_entry_big_rec() will write log. */
+
+ DBUG_EXECUTE_IF(
+ "row_ins_extern_checkpoint",
+ log_write_up_to(mtr.commit_lsn(), true););
+ err = row_ins_index_entry_big_rec(
+ entry, big_rec, offsets, &offsets_heap, index,
+ trx->mysql_thd);
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+ }
+
+func_exit:
+ if (offsets_heap != NULL) {
+ mem_heap_free(offsets_heap);
+ }
+
+ ut_free(pcur.old_rec_buf);
+ DBUG_RETURN(err);
+}
+
+/** Start a mini-transaction.
+@param[in,out] mtr mini-transaction
+@param[in,out] index secondary index */
+static void row_ins_sec_mtr_start(mtr_t *mtr, dict_index_t *index)
+{
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ const mtr_log_t log_mode = mtr->get_log_mode();
+
+ mtr->start();
+ index->set_modified(*mtr);
+ mtr->set_log_mode(log_mode);
+}
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
+@return error code */
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ DBUG_ENTER("row_ins_sec_index_entry_low");
+
+ btr_cur_t cursor;
+ btr_latch_mode search_mode = mode;
+ dberr_t err;
+ ulint n_unique;
+ mtr_t mtr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ rtr_info_t rtr_info;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_INSERT_TREE);
+
+ cursor.thr = thr;
+ cursor.rtr_info = NULL;
+ cursor.page_cur.index = index;
+ ut_ad(thr_get_trx(thr)->id != 0);
+
+ mtr.start();
+
+ if (index->table->is_temporary()) {
+ /* Disable locking, because temporary tables are never
+ shared between transactions or connections. */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ /* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+
+ if (index->is_spatial()) {
+ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
+ rtr_info_update_btr(&cursor, &rtr_info);
+
+ err = rtr_insert_leaf(&cursor, entry, search_mode, &mtr);
+
+ if (err == DB_SUCCESS && search_mode == BTR_MODIFY_LEAF
+ && rtr_info.mbr_adj) {
+ mtr_commit(&mtr);
+ search_mode = mode = BTR_MODIFY_TREE;
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false, &cursor,
+ index, false);
+ rtr_info_update_btr(&cursor, &rtr_info);
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+ err = rtr_insert_leaf(&cursor, entry,
+ search_mode, &mtr);
+ }
+
+ DBUG_EXECUTE_IF(
+ "rtree_test_check_count", {
+ goto func_exit;});
+
+ } else {
+ if (!index->table->is_temporary()) {
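+			/* BTR_INSERT allows the insert to be buffered in
+			the change buffer when the leaf page is not in
+			the buffer pool; if this transaction does not
+			require unique checks on secondary indexes,
+			BTR_IGNORE_SEC_UNIQUE additionally allows such
+			buffering for unique indexes. */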
+ search_mode = btr_latch_mode(
+ search_mode
+ | (thr_get_trx(thr)->check_unique_secondary
+ ? BTR_INSERT
+ : BTR_INSERT | BTR_IGNORE_SEC_UNIQUE));
+ }
+
+ err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode,
+ &mtr);
+ }
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ btr_decryption_failed(*index);
+ }
+ goto func_exit;
+ }
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ ut_ad(!dict_index_is_spatial(index));
+ /* The insert was buffered during the search: we are done */
+ goto func_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_n_fields_is_sane(index, first_rec, entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (dict_index_is_unique(index)
+ && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) {
+ mtr_commit(&mtr);
+
+ DEBUG_SYNC_C("row_ins_sec_index_unique");
+
+ row_ins_sec_mtr_start(&mtr, index);
+
+ err = row_ins_scan_sec_index_for_duplicate(
+ flags, index, entry, thr, &mtr, offsets_heap);
+
+ mtr_commit(&mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ if (!index->is_committed()) {
+ ut_ad(!thr_get_trx(thr)
+ ->dict_operation_lock_mode);
+ index->type |= DICT_CORRUPT;
+ /* Do not return any error to the
+ caller. The duplicate will be reported
+ by ALTER TABLE or CREATE UNIQUE INDEX.
+ Unfortunately we cannot report the
+ duplicate key value to the DDL thread,
+ because the altered_table object is
+ private to its call stack. */
+ err = DB_SUCCESS;
+ }
+ /* fall through */
+ default:
+ if (dict_index_is_spatial(index)) {
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+ DBUG_RETURN(err);
+ }
+
+ row_ins_sec_mtr_start(&mtr, index);
+
+ DEBUG_SYNC_C("row_ins_sec_index_entry_dup_locks_created");
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion (bypassing the change buffer). */
+ err = cursor.search_leaf(
+ entry, PAGE_CUR_LE,
+ btr_latch_mode(search_mode
+ & ~(BTR_INSERT
+ | BTR_IGNORE_SEC_UNIQUE)),
+ &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+ if (row_ins_must_modify_rec(&cursor)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(&cursor), index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ err = row_ins_sec_index_entry_by_modify(
+ flags, mode, &cursor, &offsets,
+ offsets_heap, heap, entry, thr, &mtr);
+
+ if (err == DB_SUCCESS && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ } else {
+ rec_t* insert_rec;
+ big_rec_t* big_rec;
+
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(
+ flags, &cursor, &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ if (err == DB_SUCCESS
+ && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ } else {
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+
+ err = btr_cur_optimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ }
+ if (err == DB_SUCCESS
+ && dict_index_is_spatial(index)
+ && rtr_info.mbr_adj) {
+ err = rtr_ins_enlarge_mbr(&cursor, &mtr);
+ }
+ }
+
+ if (err == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+ ut_ad(!big_rec);
+ }
+
+func_exit:
+ if (dict_index_is_spatial(index)) {
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+
+ mtr_commit(&mtr);
+ DBUG_RETURN(err);
+}
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ dberr_t err;
+ ulint n_uniq;
+
+ DBUG_ENTER("row_ins_clust_index_entry");
+
+ if (!index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(
+ index->table, index, true, entry, thr);
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0;
+
+#ifdef WITH_WSREP
+ const bool skip_locking
+ = wsrep_thd_skip_locking(thr_get_trx(thr)->mysql_thd);
+ ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK
+ : (index->table->is_temporary() || skip_locking)
+ ? BTR_NO_LOCKING_FLAG : 0;
+#ifdef UNIV_DEBUG
+ if (skip_locking && strcmp(wsrep_get_sr_table_name(),
+ index->table->name.m_name)) {
+ WSREP_ERROR("Record locking is disabled in this thread, "
+ "but the table being modified is not "
+ "`%s`: `%s`.", wsrep_get_sr_table_name(),
+ index->table->name.m_name);
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+#else
+ ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK
+ : index->table->is_temporary()
+ ? BTR_NO_LOCKING_FLAG : 0;
+#endif /* WITH_WSREP */
+ const ulint orig_n_fields = entry->n_fields;
+
+	/* For an intermediate table built during ALTER TABLE
+	with ALGORITHM=COPY, skip undo logging and record lock
+	checking for the insert operation. */
+ if (index->table->skip_alter_undo) {
+ flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG;
+ }
+
+ /* Try first optimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_LEAF, index, n_uniq, entry,
+ n_ext, thr);
+
+ entry->n_fields = orig_n_fields;
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_clust_index_entry_leaf");
+
+ if (err != DB_FAIL) {
+ DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after");
+ DBUG_RETURN(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_TREE, index, n_uniq, entry,
+ n_ext, thr);
+
+ entry->n_fields = orig_n_fields;
+
+ DBUG_RETURN(err);
+}
+
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ bool check_foreign) /*!< in: true if check
+ foreign table is needed, false otherwise */
+{
+ dberr_t err = DB_SUCCESS;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+ trx_id_t trx_id = 0;
+
+ DBUG_EXECUTE_IF("row_ins_sec_index_entry_timeout", {
+ DBUG_SET("-d,row_ins_sec_index_entry_timeout");
+ return(DB_LOCK_WAIT);});
+
+ if (check_foreign && !index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ false, entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(thr_get_trx(thr)->id != 0);
+
+ offsets_heap = mem_heap_create(1024);
+ heap = mem_heap_create(1024);
+
+ /* Try first optimistic descent to the B-tree */
+
+ log_free_check();
+ ulint flags = index->table->is_temporary()
+ ? BTR_NO_LOCKING_FLAG
+ : 0;
+
+	/* For an intermediate table built during ALTER TABLE
+	with ALGORITHM=COPY, skip undo logging and record lock
+	checking for the insert operation. */
+ if (index->table->skip_alter_undo) {
+ trx_id = thr_get_trx(thr)->id;
+ flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG;
+ }
+
+ err = row_ins_sec_index_entry_low(
+ flags, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry,
+ trx_id, thr);
+ if (err == DB_FAIL) {
+ mem_heap_empty(heap);
+
+ if (index->table->space == fil_system.sys_space
+ && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
+ ibuf_free_excess_pages();
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+ log_free_check();
+
+ err = row_ins_sec_index_entry_low(
+ flags, BTR_INSERT_TREE, index,
+ offsets_heap, heap, entry, 0, thr);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ return(err);
+}
+
+/***************************************************************//**
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+static
+dberr_t
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(trx->id || index->table->no_rollback()
+ || index->table->is_temporary());
+
+ DBUG_EXECUTE_IF("row_ins_index_entry_timeout", {
+ DBUG_SET("-d,row_ins_index_entry_timeout");
+ return(DB_LOCK_WAIT);});
+
+ if (index->is_btree()) {
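+		/* If this transaction has started a bulk insert into
+		the table, buffer the entry instead of inserting it
+		into the B-tree right away. */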
+ if (auto t= trx->check_bulk_buffer(index->table)) {
+ /* MDEV-25036 FIXME: check also foreign key
+ constraints */
+ ut_ad(!trx->check_foreigns);
+ return t->bulk_insert_buffered(*entry, *index, trx);
+ }
+ }
+
+ if (index->is_primary()) {
+ return row_ins_clust_index_entry(index, entry, thr, 0);
+ } else {
+ return row_ins_sec_index_entry(index, entry, thr);
+ }
+}
+
+
+/*****************************************************************//**
+This function generates the MBR (Minimum Bounding Box) for a spatial
+object and sets it in the spatial index field. */
+static
+void
+row_ins_spatial_index_entry_set_mbr_field(
+/*======================================*/
+ dfield_t* field, /*!< in/out: mbr field */
+ const dfield_t* row_field) /*!< in: row field */
+{
+ ulint dlen = 0;
+ double mbr[SPDIMS * 2];
+
+ /* This must be a GEOMETRY datatype */
+ ut_ad(DATA_GEOMETRY_MTYPE(field->type.mtype));
+
+ const byte* dptr = static_cast<const byte*>(
+ dfield_get_data(row_field));
+ dlen = dfield_get_len(row_field);
+
+ /* obtain the MBR */
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr);
+
+ /* Set mbr as index entry data */
+ dfield_write_mbr(field, mbr);
+}
+
+/** Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row.
+@param[in] index index handler
+@param[out] entry index entry to make
+@param[in] row row
+@return DB_SUCCESS if the set is successful */
+static
+dberr_t
+row_ins_index_entry_set_vals(
+ const dict_index_t* index,
+ dtuple_t* entry,
+ const dtuple_t* row)
+{
+ ulint n_fields;
+ ulint i;
+ ulint num_v = dtuple_get_n_v_fields(entry);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields + num_v; i++) {
+ dict_field_t* ind_field = NULL;
+ dfield_t* field;
+ const dfield_t* row_field;
+ ulint len;
+ dict_col_t* col;
+
+ if (i >= n_fields) {
+			/* This is a virtual field */
+ field = dtuple_get_nth_v_field(entry, i - n_fields);
+ col = &dict_table_get_nth_v_col(
+ index->table, i - n_fields)->m_col;
+ } else {
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+ col = ind_field->col;
+ }
+
+ if (col->is_virtual()) {
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ ut_ad(dtuple_get_n_fields(row)
+ == dict_table_get_n_cols(index->table));
+ row_field = dtuple_get_nth_v_field(row, v_col->v_pos);
+ } else if (col->is_dropped()) {
+ ut_ad(index->is_primary());
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ field->data = NULL;
+ field->len = UNIV_SQL_NULL;
+ field->type.prtype = DATA_BINARY_TYPE;
+ } else {
+ ut_ad(ind_field->fixed_len <= col->len);
+ dfield_set_data(field, field_ref_zero,
+ ind_field->fixed_len);
+ field->type.prtype = DATA_NOT_NULL;
+ }
+
+ field->type.mtype = col->len
+ ? DATA_FIXBINARY : DATA_BINARY;
+ continue;
+ } else {
+ row_field = dtuple_get_nth_field(
+ row, ind_field->col->ind);
+ }
+
+ len = dfield_get_len(row_field);
+
+ /* Check column prefix indexes */
+ if (ind_field != NULL && ind_field->prefix_len > 0
+ && len != UNIV_SQL_NULL) {
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len,
+ len,
+ static_cast<const char*>(
+ dfield_get_data(row_field)));
+
+ ut_ad(!dfield_is_ext(row_field));
+ }
+
+ /* Handle spatial index. For the first field, replace
+ the data with its MBR (Minimum Bounding Box). */
+ if ((i == 0) && dict_index_is_spatial(index)) {
+ if (!row_field->data
+ || row_field->len < GEO_DATA_HEADER_SIZE) {
+ return(DB_CANT_CREATE_GEOMETRY_OBJECT);
+ }
+ row_ins_spatial_index_entry_set_mbr_field(
+ field, row_field);
+ continue;
+ }
+
+ dfield_set_data(field, dfield_get_data(row_field), len);
+ if (dfield_is_ext(row_field)) {
+ ut_ad(dict_index_is_clust(index));
+ dfield_set_ext(field);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins_index_entry_step(
+/*=====================*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ DBUG_ENTER("row_ins_index_entry_step");
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ err = row_ins_index_entry_set_vals(node->index, *node->entry,
+ node->row);
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(err);
+ }
+
+ ut_ad(dtuple_check_typed(*node->entry));
+
+ err = row_ins_index_entry(node->index, *node->entry, thr);
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_index_entry_step");
+
+ DBUG_RETURN(err);
+}
+
+/***********************************************************//**
+Allocates a row id for the row if the table's clustered index is generated. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
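+	/* A row id is assigned only when the clustered index was
+	generated automatically, that is, when the table has no
+	user-defined PRIMARY KEY or suitable UNIQUE NOT NULL key. */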
+ if (dict_table_get_first_index(node->table)->is_gen_clust())
+ dict_sys_write_row_id(node->sys_buf, dict_sys.get_new_row_id());
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_ins(
+/*====*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ DBUG_ENTER("row_ins");
+
+ DBUG_PRINT("row_ins", ("table: %s", node->table->name.m_name));
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(node->entry_list.empty() == false);
+ node->entry = node->entry_list.begin();
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (dict_index_t *index = node->index) {
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
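+			/* Skip fulltext and corrupted indexes, and
+			indexes that have not yet been committed to
+			the data dictionary. */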
+ } else if (dberr_t err = row_ins_index_entry_step(node, thr)) {
+ DBUG_RETURN(err);
+ }
+ node->index = dict_table_get_next_index(index);
+ ++node->entry;
+ }
+
+ ut_ad(node->entry == node->entry_list.end());
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ dberr_t err;
+
+ ut_ad(thr);
+
+ DEBUG_SYNC_C("innodb_row_ins_step_enter");
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<ins_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. MySQL's
+ partitioned table code may also call an insert within the same
+ SQL statement AFTER it has used this table handle to do a search.
+ This happens, for example, when a row update moves it to another
+ partition. In that case, we have already set the IX lock on the
+ table during the search operation, and there is no need to set
+ it again here. But we must write trx->id to node->sys_buf. */
+
+ if (node->table->no_rollback()) {
+ /* No-rollback tables should only be written to by a
+ single thread at a time, but there can be multiple
+ concurrent readers. We must hold an open table handle. */
+ DBUG_ASSERT(node->table->get_ref_count() > 0);
+ DBUG_ASSERT(node->ins_type == INS_DIRECT);
+ /* No-rollback tables can consist only of a single index. */
+ DBUG_ASSERT(node->entry_list.size() == 1);
+ DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1);
+ /* There should be no possibility for interruption and
+ restarting here. In theory, we could allow resumption
+ from the INS_NODE_INSERT_ENTRIES state here. */
+ DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK);
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = node->entry_list.begin();
+ node->state = INS_NODE_INSERT_ENTRIES;
+ goto do_insert;
+ }
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->table->is_temporary()) {
+ node->trx_id = trx->id;
+ }
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (trx->id == node->trx_id) {
+ /* No need to do IX-locking */
+
+ goto same_trx;
+ }
+
+ err = lock_table(node->table, NULL, LOCK_IX, thr);
+
+ DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait",
+ err = DB_LOCK_WAIT;);
+
+ if (err != DB_SUCCESS) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+same_trx:
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+do_insert:
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
new file mode 100644
index 00000000..c4f46304
--- /dev/null
+++ b/storage/innobase/row/row0log.cc
@@ -0,0 +1,4134 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0log.cc
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#include "row0log.h"
+#include "row0row.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0merge.h"
+#include "row0ext.h"
+#include "log0crypt.h"
+#include "data0data.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "handler0alter.h"
+#include "ut0stage.h"
+#include "trx0rec.h"
+
+#include <sql_class.h>
+#include <algorithm>
+#include <map>
+
+Atomic_counter<ulint> onlineddl_rowlog_rows;
+ulint onlineddl_rowlog_pct_used;
+ulint onlineddl_pct_progress;
+
+/** Table row modification operations during online table rebuild.
+Delete-marked records are not copied to the rebuilt table. */
+enum row_tab_op {
+ /** Insert a record */
+ ROW_T_INSERT = 0x41,
+ /** Update a record in place */
+ ROW_T_UPDATE,
+ /** Delete (purge) a record */
+ ROW_T_DELETE
+};
+
+/** Index record modification operations during online index creation */
+enum row_op {
+ /** Insert a record */
+ ROW_OP_INSERT = 0x61,
+ /** Delete a record */
+ ROW_OP_DELETE
+};
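+/* Note: the operation codes above are distinct ASCII letters
+(ROW_T_INSERT = 'A', ROW_OP_INSERT = 'a', and so on); presumably this was
+chosen to make the log contents easier to recognize in a hex dump. */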
+
+/** Size of the modification log entry header, in bytes */
+#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/
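+/* ROW_LOG_HEADER_SIZE covers the one-byte operation code and one byte of
+extra_size; when extra_size >= 0x80, the second extra_size byte is
+accounted for separately by the writers below. */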
+
+/** Log block for modifications during online ALTER TABLE */
+struct row_log_buf_t {
+ byte* block; /*!< file block buffer */
+ size_t size; /*!< length of block in bytes */
+ ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set
+ by ut_allocator::allocate_large() and fed to
+ ut_allocator::deallocate_large(). */
+ mrec_buf_t buf; /*!< buffer for accessing a record
+ that spans two blocks */
+ ulint blocks; /*!< current position in blocks */
+ ulint bytes; /*!< current position within block */
+ ulonglong total; /*!< logical position, in bytes from
+ the start of the row_log_table log;
+ 0 for row_log_online_op() and
+ row_log_apply(). */
+};
+
+/** @brief Buffer for logging modifications during online index creation
+
+All modifications to an index that is being created will be logged by
+row_log_online_op() to this buffer.
+
+All modifications to a table that is being rebuilt will be logged by
+row_log_table_delete(), row_log_table_update(), row_log_table_insert()
+to this buffer.
+
+When head.blocks == tail.blocks, the reader will access tail.block
+directly. When also head.bytes == tail.bytes, both counts will be
+reset to 0 and the file will be truncated. */
+struct row_log_t {
+ pfs_os_file_t fd; /*!< file descriptor */
+ mysql_mutex_t mutex; /*!< mutex protecting error,
+ max_trx and tail */
+ dict_table_t* table; /*!< table that is being rebuilt,
+ or NULL when this is a secondary
+ index that is being created online */
+ bool same_pk;/*!< whether the definition of the PRIMARY KEY
+ has remained the same */
+ const dtuple_t* defaults;
+ /*!< default values of added, changed columns,
+ or NULL */
+ const ulint* col_map;/*!< mapping of old column numbers to
+ new ones, or NULL if !table */
+ dberr_t error; /*!< error that occurred during online
+ table rebuild */
+ /** The transaction ID of the ALTER TABLE transaction. Any
+ concurrent DML would necessarily be logged with a larger
+ transaction ID, because ha_innobase::prepare_inplace_alter_table()
+ acts as a barrier that ensures that any concurrent transaction
+ that operates on the table would have been started after
+ ha_innobase::prepare_inplace_alter_table() returns and before
+ ha_innobase::commit_inplace_alter_table(commit=true) is invoked.
+
+ Due to the nondeterministic nature of purge and due to the
+ possibility of upgrading from an earlier version of MariaDB
+ or MySQL, it is possible that row_log_table_low() would be
+ fed a DB_TRX_ID that precedes min_trx. We must normalize
+ such references to reset_trx_id[]. */
+ trx_id_t min_trx;
+ trx_id_t max_trx;/*!< biggest observed trx_id in
+ row_log_online_op();
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ row_log_buf_t tail; /*!< writer context;
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ size_t crypt_tail_size; /*!< size of the crypt_tail buffer */
+ byte* crypt_tail; /*!< writer context;
+ temporary buffer used in encryption,
+ decryption or NULL*/
+ row_log_buf_t head; /*!< reader context; protected by MDL only;
+ modifiable by row_log_apply_ops() */
+ size_t crypt_head_size; /*!< size of the crypt_head buffer */
+ byte* crypt_head; /*!< reader context;
+ temporary buffer used in encryption,
+ decryption or NULL */
+ const char* path; /*!< where to create temporary file during
+ log operation */
+ /** the number of core fields in the clustered index of the
+ source table; before row_log_table_apply() completes, the
+ table could be emptied, so that table->is_instant() no longer holds,
+ but all log records must be in the "instant" format. */
+ unsigned n_core_fields;
+ /** the default values of non-core fields when the operation started */
+ dict_col_t::def_t* non_core_fields;
+ bool allow_not_null; /*!< whether ALTER IGNORE is being
+ used or the SQL mode is non-strict;
+ if not, NULL values will not be
+ converted to defaults */
+ const TABLE* old_table; /*!< old table definition, used when
+ reporting rows that fail conversion */
+
+ uint64_t n_rows; /*!< number of rows read from the table */
+
+ /** Alter table transaction. It can be used to apply the DML logs
+ into the table */
+ const trx_t* alter_trx;
+
+ /** Determine whether the log should be in the 'instant ADD' format
+ @param[in] index the clustered index of the source table
+ @return whether to use the 'instant ADD COLUMN' format */
+ bool is_instant(const dict_index_t* index) const
+ {
+ ut_ad(table);
+ ut_ad(n_core_fields <= index->n_fields);
+ return n_core_fields != index->n_fields;
+ }
+
+ const byte* instant_field_value(ulint n, ulint* len) const
+ {
+ ut_ad(n >= n_core_fields);
+ const dict_col_t::def_t& d= non_core_fields[n - n_core_fields];
+ *len = d.len;
+ return static_cast<const byte*>(d.data);
+ }
+};
+
+/** Create the temporary file for the online log if it has not been
+created yet.
+@param[in,out] log online rebuild log
+@return the file descriptor, or OS_FILE_CLOSED on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+pfs_os_file_t
+row_log_tmpfile(
+ row_log_t* log)
+{
+ DBUG_ENTER("row_log_tmpfile");
+ if (log->fd == OS_FILE_CLOSED) {
+ log->fd = row_merge_file_create_low(log->path);
+ DBUG_EXECUTE_IF("row_log_tmpfile_fail",
+ if (log->fd != OS_FILE_CLOSED)
+ row_merge_file_destroy_low(log->fd);
+ log->fd = OS_FILE_CLOSED;);
+ if (log->fd != OS_FILE_CLOSED) {
+ MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_LOG_FILES);
+ }
+ }
+
+ DBUG_RETURN(log->fd);
+}
+
+/** Allocate the memory for the log buffer.
+@param[in,out] log_buf Buffer used for log operation
+@return true if success, false if not */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_log_block_allocate(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_allocate");
+ if (log_buf.block == NULL) {
+ DBUG_EXECUTE_IF(
+ "simulate_row_log_allocation_failure",
+ DBUG_RETURN(false);
+ );
+
+ log_buf.block = ut_allocator<byte>(mem_key_row_log_buf)
+ .allocate_large(srv_sort_buf_size,
+ &log_buf.block_pfx);
+
+ if (log_buf.block == NULL) {
+ DBUG_RETURN(false);
+ }
+ log_buf.size = srv_sort_buf_size;
+ }
+ DBUG_RETURN(true);
+}
+
+/** Free the log buffer.
+@param[in,out] log_buf Buffer used for log operation */
+static
+void
+row_log_block_free(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_free");
+ if (log_buf.block != NULL) {
+ ut_allocator<byte>(mem_key_row_log_buf).deallocate_large(
+ log_buf.block, &log_buf.block_pfx);
+ log_buf.block = NULL;
+ }
+ DBUG_VOID_RETURN;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index index, S or X latched
+@param tuple index tuple
+@param trx_id transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() fails
+@retval true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+ trx_id_t trx_id)
+{
+ byte* b;
+ ulint extra_size;
+ ulint size;
+ ulint mrec_size;
+ ulint avail_size;
+ row_log_t* log;
+ bool success= true;
+
+ ut_ad(dtuple_validate(tuple));
+ ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index));
+ ut_ad(index->lock.have_x() || index->lock.have_s());
+
+ if (index->is_corrupted()) {
+ return success;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
+
+ /* Compute the size of the record. This differs from
+ row_merge_buf_encode(), because here we do not encode
+ extra_size+1 (and reserve 0 as the end-of-chunk marker). */
+
+ size = rec_get_converted_size_temp<false>(
+ index, tuple->fields, tuple->n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+ ut_ad(size <= sizeof log->tail.buf);
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + size
+ + (trx_id ? DATA_TRX_ID_LEN : 0);
+
+ log = index->online_log;
+ mysql_mutex_lock(&log->mutex);
+
+start_log:
+ if (trx_id > log->max_trx) {
+ log->max_trx = trx_id;
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ avail_size = srv_sort_buf_size - log->tail.bytes;
+
+ if (mrec_size > avail_size) {
+ b = log->tail.buf;
+ } else {
+ b = log->tail.block + log->tail.bytes;
+ }
+
+ if (trx_id != 0) {
+ *b++ = ROW_OP_INSERT;
+ trx_write_trx_id(b, trx_id);
+ b += DATA_TRX_ID_LEN;
+ } else {
+ *b++ = ROW_OP_DELETE;
+ }
+
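+ /* extra_size is stored in one byte when it is below 0x80;
+ otherwise it is stored in two bytes, with the 0x80 bit set in
+ the first byte. For example, extra_size 0x1234 would be
+ written as the bytes 0x92 0x34. */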
+ if (extra_size < 0x80) {
+ *b++ = (byte) extra_size;
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = (byte) (0x80 | (extra_size >> 8));
+ *b++ = (byte) extra_size;
+ }
+
+ rec_convert_dtuple_to_temp<false>(
+ b + extra_size, index, tuple->fields, tuple->n_fields);
+
+ b += size;
+
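+ /* If the record filled or overflowed the current block, flush
+ the block to the log file; any overflow bytes (staged in
+ log->tail.buf above) are carried over to the start of the
+ next block. */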
+ if (mrec_size >= avail_size) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ byte* buf = log->tail.block;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ if (index->online_status != ONLINE_INDEX_COMPLETE)
+ goto write_failed;
+ /* We are about to run out of log space; InnoDB has to
+ apply the online log for the completed index */
+ index->lock.s_unlock();
+ dberr_t error= row_log_apply(
+ log->alter_trx, index, nullptr, nullptr);
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (error != DB_SUCCESS) {
+ /* Mark all newly added indexes
+ as corrupted */
+ log->error = error;
+ success = false;
+ goto err_exit;
+ }
+
+ /* Recheck whether the index still has an online log */
+ if (!index->online_log) {
+ goto err_exit;
+ }
+
+ goto start_log;
+ }
+
+ if (mrec_size == avail_size) {
+ ut_ad(b == &buf[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + mrec_size);
+ memcpy(buf + log->tail.bytes,
+ log->tail.buf, avail_size);
+ }
+
+ MEM_CHECK_DEFINED(buf, srv_sort_buf_size);
+
+ if (row_log_tmpfile(log) == OS_FILE_CLOSED) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ /* If encryption is enabled, encrypt the buffer before
+ writing it to the file system. */
+ if (srv_encrypt_log) {
+ if (!log_tmp_block_encrypt(
+ buf, srv_sort_buf_size,
+ log->crypt_tail, byte_offset)) {
+ log->error = DB_DECRYPTION_FAILED;
+ goto write_failed;
+ }
+
+ srv_stats.n_rowlog_blocks_encrypted.inc();
+ buf = log->crypt_tail;
+ }
+
+ log->tail.blocks++;
+ if (os_file_write(
+ IORequestWrite,
+ "(modification log)",
+ log->fd,
+ buf, byte_offset, srv_sort_buf_size)
+ != DB_SUCCESS) {
+write_failed:
+ index->type |= DICT_CORRUPT;
+ }
+
+ MEM_UNDEFINED(log->tail.block, srv_sort_buf_size);
+ MEM_UNDEFINED(buf, srv_sort_buf_size);
+
+ memcpy(log->tail.block, log->tail.buf + avail_size,
+ mrec_size - avail_size);
+ log->tail.bytes = mrec_size - avail_size;
+ } else {
+ log->tail.bytes += mrec_size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+err_exit:
+ mysql_mutex_unlock(&log->mutex);
+ return success;
+}
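+/* A minimal usage sketch (hypothetical caller, for illustration only):
+while a secondary index is being created online, the DML code path would
+log each change to that index while holding index->lock in S or X mode,
+for example
+
+	row_log_online_op(index, entry, trx->id);	// log an insert
+	row_log_online_op(index, entry, 0);		// log a delete
+
+where "entry" is assumed to be the affected index tuple. */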
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ return(index->online_log->error);
+}
+
+/******************************************************//**
+Starts logging an operation to a table that is being rebuilt.
+@return pointer to log, or NULL if no logging is necessary */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+byte*
+row_log_table_open(
+/*===============*/
+ row_log_t* log, /*!< in/out: online rebuild log */
+ ulint size, /*!< in: size of log record */
+ ulint* avail) /*!< out: available size for log record */
+{
+ mysql_mutex_lock(&log->mutex);
+
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ mysql_mutex_unlock(&log->mutex);
+ return(NULL);
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ *avail = srv_sort_buf_size - log->tail.bytes;
+
+ if (size > *avail) {
+ /* Make sure log->tail.buf is large enough */
+ ut_ad(size <= sizeof log->tail.buf);
+ return(log->tail.buf);
+ } else {
+ return(log->tail.block + log->tail.bytes);
+ }
+}
+
+/******************************************************//**
+Stops logging an operation to a table that is being rebuilt. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_log_table_close_func(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: online rebuilt index */
+#ifdef UNIV_DEBUG
+ const byte* b, /*!< in: end of log record */
+#endif /* UNIV_DEBUG */
+ ulint size, /*!< in: size of log record */
+ ulint avail) /*!< in: available size for log record */
+{
+ row_log_t* log = index->online_log;
+
+ mysql_mutex_assert_owner(&log->mutex);
+
+ if (size >= avail) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ byte* buf = log->tail.block;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ goto write_failed;
+ }
+
+ if (size == avail) {
+ ut_ad(b == &buf[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + size);
+ memcpy(buf + log->tail.bytes, log->tail.buf, avail);
+ }
+
+ MEM_CHECK_DEFINED(buf, srv_sort_buf_size);
+
+ if (row_log_tmpfile(log) == OS_FILE_CLOSED) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ /* If encryption is enabled, encrypt the buffer before
+ writing it to the file system. */
+ if (srv_encrypt_log) {
+ if (!log_tmp_block_encrypt(
+ log->tail.block, srv_sort_buf_size,
+ log->crypt_tail, byte_offset,
+ index->table->space_id)) {
+ log->error = DB_DECRYPTION_FAILED;
+ goto err_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_encrypted.inc();
+ buf = log->crypt_tail;
+ }
+
+ log->tail.blocks++;
+ if (os_file_write(
+ IORequestWrite,
+ "(modification log)",
+ log->fd,
+ buf, byte_offset, srv_sort_buf_size)
+ != DB_SUCCESS) {
+write_failed:
+ log->error = DB_ONLINE_LOG_TOO_BIG;
+ }
+
+ MEM_UNDEFINED(log->tail.block, srv_sort_buf_size);
+ MEM_UNDEFINED(buf, srv_sort_buf_size);
+ memcpy(log->tail.block, log->tail.buf + avail, size - avail);
+ log->tail.bytes = size - avail;
+ } else {
+ log->tail.bytes += size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ log->tail.total += size;
+ MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
+err_exit:
+ mysql_mutex_unlock(&log->mutex);
+
+ onlineddl_rowlog_rows++;
+ /* 10000 means 100.00%, 4525 means 45.25% */
+ onlineddl_rowlog_pct_used = static_cast<ulint>((log->tail.total * 10000) / srv_online_max_size);
+}
+
+#ifdef UNIV_DEBUG
+# define row_log_table_close(index, b, size, avail) \
+ row_log_table_close_func(index, b, size, avail)
+#else /* UNIV_DEBUG */
+# define row_log_table_close(index, b, size, avail) \
+ row_log_table_close_func(index, size, avail)
+#endif /* UNIV_DEBUG */
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in] index cluster index
+@param[in] v_no virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+ const dict_index_t* index,
+ ulint v_no)
+{
+ return(dict_table_get_nth_v_col(
+ index->online_log->table, v_no)->m_col.ord_part);
+}
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+{
+ ulint old_pk_extra_size;
+ ulint old_pk_size;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ const dtuple_t* old_pk;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+ ut_ad(index->lock.have_any());
+
+ if (index->online_status != ONLINE_INDEX_CREATION
+ || (index->type & DICT_CORRUPT) || index->table->corrupted
+ || index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(new_table);
+
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(index->online_log->min_trx);
+
+ /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */
+ if (index->online_log->same_pk) {
+ dtuple_t* tuple;
+ ut_ad(new_index->n_uniq == index->n_uniq);
+
+ /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first
+ fields of the record. */
+ heap = mem_heap_create(
+ DATA_TRX_ID_LEN
+ + DTUPLE_EST_ALLOC(new_index->first_user_field()));
+ old_pk = tuple = dtuple_create(heap,
+ new_index->first_user_field());
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_index->n_uniq);
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ dfield_t* dfield = dtuple_get_nth_field(
+ tuple, i);
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ dfield_set_data(dfield, field, len);
+ }
+
+ dfield_t* db_trx_id = dtuple_get_nth_field(
+ tuple, new_index->n_uniq);
+
+ const bool replace_sys_fields
+ = sys
+ || trx_read_trx_id(static_cast<byte*>(db_trx_id->data))
+ < index->online_log->min_trx;
+
+ if (replace_sys_fields) {
+ if (!sys || trx_read_trx_id(sys)
+ < index->online_log->min_trx) {
+ sys = reset_trx_id;
+ }
+
+ dfield_set_data(db_trx_id, sys, DATA_TRX_ID_LEN);
+ dfield_set_data(db_trx_id + 1, sys + DATA_TRX_ID_LEN,
+ DATA_ROLL_PTR_LEN);
+ }
+
+ ut_d(trx_id_check(db_trx_id->data,
+ index->online_log->min_trx));
+ } else {
+ /* The PRIMARY KEY has changed. Translate the tuple. */
+ old_pk = row_log_table_get_pk(
+ rec, index, offsets, NULL, &heap);
+
+ if (!old_pk) {
+ ut_ad(index->online_log->error != DB_SUCCESS);
+ if (heap) {
+ goto func_exit;
+ }
+ return;
+ }
+ }
+
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+
+ /* 2 = 1 (extra_size) + at least 1 byte payload */
+ mrec_size = 2 + old_pk_size;
+
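+ /* The ROW_T_DELETE log record layout is: one byte ROW_T_DELETE,
+ one byte old_pk_extra_size, and the old
+ PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR converted to the temporary
+ record format of new_index. */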
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ *b++ = ROW_T_DELETE;
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+
+ b += old_pk_size;
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+
+func_exit:
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static
+void
+row_log_table_low_redundant(
+/*========================*/
+ const rec_t* rec, /*!< in: clustered index leaf
+ page record in ROW_FORMAT=REDUNDANT,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ bool insert, /*!< in: true if insert,
+ false if update */
+ const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value
+ (if !insert and a PRIMARY KEY
+ is being created) */
+ const dict_index_t* new_index)
+ /*!< in: clustered index of the
+ new table, not latched */
+{
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ dtuple_t* tuple;
+ const ulint n_fields = rec_get_n_fields_old(rec);
+
+ ut_ad(index->n_fields >= n_fields);
+ ut_ad(index->n_fields == n_fields || index->is_instant());
+ ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2));
+ ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */
+ ut_ad(dict_index_is_clust(new_index));
+
+ heap = mem_heap_create(DTUPLE_EST_ALLOC(n_fields));
+ tuple = dtuple_create(heap, n_fields);
+ dict_index_copy_types(tuple, index, n_fields);
+
+ dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ for (ulint i = 0; i < n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ for (ulint i = 0; i < n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_2_is_field_extern(rec, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+ }
+
+ dfield_t* db_trx_id = dtuple_get_nth_field(tuple, index->n_uniq);
+ ut_ad(dfield_get_len(db_trx_id) == DATA_TRX_ID_LEN);
+ ut_ad(dfield_get_len(db_trx_id + 1) == DATA_ROLL_PTR_LEN);
+
+ if (trx_read_trx_id(static_cast<const byte*>
+ (dfield_get_data(db_trx_id)))
+ < index->online_log->min_trx) {
+ dfield_set_data(db_trx_id, reset_trx_id, DATA_TRX_ID_LEN);
+ dfield_set_data(db_trx_id + 1, reset_trx_id + DATA_TRX_ID_LEN,
+ DATA_ROLL_PTR_LEN);
+ }
+
+ const bool is_instant = index->online_log->is_instant(index);
+ rec_comp_status_t status = is_instant
+ ? REC_STATUS_INSTANT : REC_STATUS_ORDINARY;
+
+ size = rec_get_converted_size_temp<true>(
+ index, tuple->fields, tuple->n_fields, &extra_size, status);
+ if (is_instant) {
+ size++;
+ extra_size++;
+ }
+
+ mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80);
+
+ if (insert || index->online_log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ if (insert) {
+ *b++ = ROW_T_INSERT;
+ } else {
+ *b++ = ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ if (status == REC_STATUS_INSTANT) {
+ ut_ad(is_instant);
+ if (n_fields <= index->online_log->n_core_fields) {
+ status = REC_STATUS_ORDINARY;
+ }
+ *b = status;
+ }
+
+ rec_convert_dtuple_to_temp<true>(
+ b + extra_size, index, tuple->fields, tuple->n_fields,
+ status);
+ b += size;
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static
+void
+row_log_table_low(
+/*==============*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ bool insert, /*!< in: true if insert, false if update */
+ const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert
+ and a PRIMARY KEY is being created) */
+{
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ const dict_index_t* new_index;
+ row_log_t* log = index->online_log;
+
+ new_index = dict_table_get_first_index(log->table);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf);
+ ut_ad(index->lock.have_any());
+
+ /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix
+ of the clustered index record (PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR),
+ with no information on virtual columns */
+ ut_ad(!old_pk || !insert);
+ ut_ad(!old_pk || old_pk->n_v_fields == 0);
+
+ if (index->online_status != ONLINE_INDEX_CREATION
+ || (index->type & DICT_CORRUPT) || index->table->corrupted
+ || log->error != DB_SUCCESS) {
+ return;
+ }
+
+ if (!rec_offs_comp(offsets)) {
+ row_log_table_low_redundant(
+ rec, index, insert, old_pk, new_index);
+ return;
+ }
+
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY
+ || rec_get_status(rec) == REC_STATUS_INSTANT);
+
+ const ulint omit_size = REC_N_NEW_EXTRA_BYTES;
+
+ const ulint rec_extra_size = rec_offs_extra_size(offsets) - omit_size;
+ const bool is_instant = log->is_instant(index);
+ extra_size = rec_extra_size + is_instant;
+
+ unsigned fake_extra_size = 0;
+ byte fake_extra_buf[3];
+ if (is_instant && UNIV_UNLIKELY(!index->is_instant())) {
+ /* The source table was emptied after ALTER TABLE
+ started, and it was converted to non-instant format.
+ Because row_log_table_apply_op() expects to find
+ all records to be logged in the same way, we will
+ be unable to copy the rec_extra_size bytes from the
+ record header, but must convert them here. */
+ unsigned n_add = index->n_fields - 1 - log->n_core_fields;
+ fake_extra_size = rec_get_n_add_field_len(n_add);
+ ut_ad(fake_extra_size == 1 || fake_extra_size == 2);
+ extra_size += fake_extra_size;
+ byte* fake_extra = fake_extra_buf + fake_extra_size;
+ rec_set_n_add_field(fake_extra, n_add);
+ ut_ad(fake_extra == fake_extra_buf);
+ }
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size
+ + is_instant + fake_extra_size;
+
+ if (insert || log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp<false>(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
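+ /* The log record layout is: one byte ROW_T_INSERT or
+ ROW_T_UPDATE; for ROW_T_UPDATE with a redefined PRIMARY KEY,
+ one byte old_pk_extra_size followed by the old PK record;
+ extra_size in one or two bytes; a record status byte when
+ logging in the "instant ADD" format; the record header
+ (extra bytes); and the record payload, with any DB_TRX_ID
+ older than log->min_trx replaced by reset_trx_id. */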
+ if (byte* b = row_log_table_open(log, mrec_size, &avail_size)) {
+ if (insert) {
+ *b++ = ROW_T_INSERT;
+ } else {
+ *b++ = ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp<false>(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ if (is_instant) {
+ *b++ = fake_extra_size
+ ? REC_STATUS_INSTANT
+ : rec_get_status(rec);
+ } else {
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+ }
+
+ memcpy(b, rec - rec_extra_size - omit_size, rec_extra_size);
+ b += rec_extra_size;
+ memcpy(b, fake_extra_buf + 1, fake_extra_size);
+ b += fake_extra_size;
+ ulint len;
+ ulint trx_id_offs = rec_get_nth_field_offs(
+ offsets, index->n_uniq, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memcpy(b, rec, rec_offs_data_size(offsets));
+ if (trx_read_trx_id(b + trx_id_offs) < log->min_trx) {
+ memcpy(b + trx_id_offs,
+ reset_trx_id, sizeof reset_trx_id);
+ }
+ b += rec_offs_data_size(offsets);
+
+ row_log_table_close(index, b, mrec_size, avail_size);
+ }
+}
+
+/******************************************************//**
+Logs an update to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk) /*!< in: row_log_table_get_pk()
+ before the update */
+{
+ row_log_table_low(rec, index, offsets, false, old_pk);
+}
+
+/** Gets the old table column of a PRIMARY KEY column.
+@param table old table (before ALTER TABLE)
+@param col_map mapping of old column numbers to new ones
+@param col_no column position in the new table
+@return old table column, or NULL if this is an added column */
+static
+const dict_col_t*
+row_log_table_get_pk_old_col(
+/*=========================*/
+ const dict_table_t* table,
+ const ulint* col_map,
+ ulint col_no)
+{
+ for (ulint i = 0; i < table->n_cols; i++) {
+ if (col_no == col_map[i]) {
+ return(dict_table_get_nth_col(table, i));
+ }
+ }
+
+ return(NULL);
+}
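+/* Note: log->col_map maps old column positions to new ones
+(ULINT_UNDEFINED for columns that are being dropped), so this function
+performs the reverse lookup. For example, with a hypothetical
+col_map = {0, 2, ULINT_UNDEFINED}, the new column 2 would map back to the
+old column 1, and a column that exists only in the new table would
+yield NULL. */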
+
+/** Fetches the value of a PRIMARY KEY column from an old-table record.
+@param[in] ifield clustered index field in the new table (after
+ALTER TABLE)
+@param[in] index the clustered index of ifield
+@param[in,out] dfield clustered index tuple field in the new table
+@param[in,out] heap memory heap for allocating dfield contents
+@param[in] rec clustered index leaf page record in the old
+table
+@param[in] offsets rec_get_offsets(rec)
+@param[in] i rec field corresponding to col
+@param[in] zip_size ROW_FORMAT=COMPRESSED size of the old table
+@param[in] max_len maximum length of dfield
+@param[in] log row log for the table
+@retval DB_SUCCESS if the value was copied
+@retval DB_INVALID_NULL if a NULL value is encountered
+@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */
+static
+dberr_t
+row_log_table_get_pk_col(
+ const dict_field_t* ifield,
+ const dict_index_t* index,
+ dfield_t* dfield,
+ mem_heap_t* heap,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ ulint i,
+ ulint zip_size,
+ ulint max_len,
+ const row_log_t* log)
+{
+ const byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len == UNIV_SQL_DEFAULT) {
+ field = log->instant_field_value(i, &len);
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ if (!log->allow_not_null) {
+ return(DB_INVALID_NULL);
+ }
+
+ unsigned col_no= ifield->col->ind;
+ ut_ad(col_no < log->defaults->n_fields);
+
+ field = static_cast<const byte*>(
+ log->defaults->fields[col_no].data);
+ if (!field) {
+ return(DB_INVALID_NULL);
+ }
+ len = log->defaults->fields[col_no].len;
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint field_len = ifield->prefix_len;
+ byte* blob_field;
+
+ if (!field_len) {
+ field_len = ifield->fixed_len;
+ if (!field_len) {
+ field_len = max_len + 1;
+ }
+ }
+
+ blob_field = static_cast<byte*>(
+ mem_heap_alloc(heap, field_len));
+
+ len = btr_copy_externally_stored_field_prefix(
+ blob_field, field_len, zip_size, field, len);
+ if (len >= max_len + 1) {
+ return(DB_TOO_BIG_INDEX_COL);
+ }
+
+ dfield_set_data(dfield, blob_field, len);
+ } else {
+ dfield_set_data(dfield, mem_heap_dup(heap, field, len), len);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+{
+ dtuple_t* tuple = NULL;
+ row_log_t* log = index->online_log;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
+ ut_ad(index->lock.have_any());
+ ut_ad(log);
+ ut_ad(log->table);
+ ut_ad(log->min_trx);
+
+ if (log->same_pk) {
+ /* The PRIMARY KEY columns are unchanged. */
+ if (sys) {
+ /* Store the DB_TRX_ID,DB_ROLL_PTR. */
+ ulint trx_id_offs = index->trx_id_offset;
+
+ if (!trx_id_offs) {
+ ulint len;
+
+ if (!offsets) {
+ offsets = rec_get_offsets(
+ rec, index, nullptr,
+ index->n_core_fields,
+ index->db_trx_id() + 1, heap);
+ }
+
+ trx_id_offs = rec_get_nth_field_offs(
+ offsets, index->db_trx_id(), &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ const byte* ptr = trx_read_trx_id(rec + trx_id_offs)
+ < log->min_trx
+ ? reset_trx_id
+ : rec + trx_id_offs;
+
+ memcpy(sys, ptr, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ ut_d(trx_id_check(sys, log->min_trx));
+ }
+
+ return(NULL);
+ }
+
+ mysql_mutex_lock(&log->mutex);
+
+ /* log->error is protected by log->mutex. */
+ if (log->error == DB_SUCCESS) {
+ dict_table_t* new_table = log->table;
+ dict_index_t* new_index
+ = dict_table_get_first_index(new_table);
+ const ulint new_n_uniq
+ = dict_index_get_n_unique(new_index);
+
+ if (!*heap) {
+ ulint size = 0;
+
+ if (!offsets) {
+ size += (1 + REC_OFFS_HEADER_SIZE
+ + unsigned(index->n_fields))
+ * sizeof *offsets;
+ }
+
+ for (ulint i = 0; i < new_n_uniq; i++) {
+ size += dict_col_get_min_size(
+ dict_index_get_nth_col(new_index, i));
+ }
+
+ *heap = mem_heap_create(
+ DTUPLE_EST_ALLOC(new_n_uniq + 2) + size);
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+ }
+
+ tuple = dtuple_create(*heap, new_n_uniq + 2);
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_n_uniq);
+
+ const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table);
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ for (ulint new_i = 0; new_i < new_n_uniq; new_i++) {
+ dict_field_t* ifield;
+ dfield_t* dfield;
+ ulint prtype;
+ ulint mbminlen, mbmaxlen;
+
+ ifield = dict_index_get_nth_field(new_index, new_i);
+ dfield = dtuple_get_nth_field(tuple, new_i);
+
+ const ulint col_no
+ = dict_field_get_col(ifield)->ind;
+
+ if (const dict_col_t* col
+ = row_log_table_get_pk_old_col(
+ index->table, log->col_map, col_no)) {
+ ulint i = dict_col_get_clust_pos(col, index);
+
+ if (i == ULINT_UNDEFINED) {
+ ut_ad(0);
+ log->error = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ log->error = row_log_table_get_pk_col(
+ ifield, new_index, dfield, *heap,
+ rec, offsets, i, zip_size, max_len,
+ log);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ tuple = NULL;
+ goto func_exit;
+ }
+
+ mbminlen = col->mbminlen;
+ mbmaxlen = col->mbmaxlen;
+ prtype = col->prtype;
+ } else {
+ /* No matching column was found in the old
+ table, so this must be an added column.
+ Copy the default value. */
+ ut_ad(log->defaults);
+
+ dfield_copy(dfield, dtuple_get_nth_field(
+ log->defaults, col_no));
+ mbminlen = dfield->type.mbminlen;
+ mbmaxlen = dfield->type.mbmaxlen;
+ prtype = dfield->type.prtype;
+ }
+
+ ut_ad(!dfield_is_ext(dfield));
+ ut_ad(!dfield_is_null(dfield));
+
+ if (ifield->prefix_len) {
+ ulint len = dtype_get_at_most_n_mbchars(
+ prtype, mbminlen, mbmaxlen,
+ ifield->prefix_len,
+ dfield_get_len(dfield),
+ static_cast<const char*>(
+ dfield_get_data(dfield)));
+
+ ut_ad(len <= dfield_get_len(dfield));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ const byte* trx_roll = rec
+ + row_get_trx_id_offset(index, offsets);
+
+ /* Copy the fields, because the fields will be updated
+ or the record may be moved somewhere else in the B-tree
+ as part of the upcoming operation. */
+ if (trx_read_trx_id(trx_roll) < log->min_trx) {
+ trx_roll = reset_trx_id;
+ if (sys) {
+ memcpy(sys, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+ } else if (sys) {
+ memcpy(sys, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ trx_roll = sys;
+ } else {
+ trx_roll = static_cast<const byte*>(
+ mem_heap_dup(
+ *heap, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+ }
+
+ ut_d(trx_id_check(trx_roll, log->min_trx));
+
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq),
+ trx_roll, DATA_TRX_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1),
+ trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
+ }
+
+func_exit:
+ mysql_mutex_unlock(&log->mutex);
+ return(tuple);
+}
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec,index) */
+{
+ row_log_table_low(rec, index, offsets, true, NULL);
+}
+
+/******************************************************//**
+Converts a log record to a table row.
+@return converted row, or NULL if the conversion fails */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const dtuple_t*
+row_log_table_apply_convert_mrec(
+/*=============================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ dict_index_t* index, /*!< in: index of mrec */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ row_log_t* log, /*!< in: rebuild context */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dberr_t* error) /*!< out: DB_SUCCESS or
+ DB_MISSING_HISTORY or
+ reason of failure */
+{
+ dtuple_t* row;
+
+ log->n_rows++;
+ *error = DB_SUCCESS;
+
+ /* This is based on row_build(). */
+ if (log->defaults) {
+ row = dtuple_copy(log->defaults, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(log->table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else {
+ row = dtuple_create(heap, dict_table_get_n_cols(log->table));
+ dict_table_copy_types(row, log->table);
+ }
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+
+ if (ind_field->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ if (col->is_dropped()) {
+ /* the column was instantly dropped earlier */
+ ut_ad(index->table->instant);
+ continue;
+ }
+
+ ulint col_no
+ = log->col_map[dict_col_get_no(col)];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* the column is being dropped now */
+ continue;
+ }
+
+ dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ ulint len;
+ const byte* data;
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ut_ad(rec_offs_any_extern(offsets));
+ index->lock.x_lock(SRW_LOCK_CALL);
+
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets,
+ index->table->space->zip_size(),
+ i, &len, heap);
+ ut_a(data);
+ dfield_set_data(dfield, data, len);
+
+ index->lock.x_unlock();
+ } else {
+ data = rec_get_nth_field(mrec, offsets, i, &len);
+ if (len == UNIV_SQL_DEFAULT) {
+ data = log->instant_field_value(i, &len);
+ }
+ dfield_set_data(dfield, data, len);
+ }
+
+ if (len != UNIV_SQL_NULL && col->mtype == DATA_MYSQL
+ && col->len != len && !dict_table_is_comp(log->table)) {
+
+ ut_ad(col->len >= len);
+ if (dict_table_is_comp(index->table)) {
+ byte* buf = (byte*) mem_heap_alloc(heap,
+ col->len);
+ memcpy(buf, dfield->data, len);
+ memset(buf + len, 0x20, col->len - len);
+
+ dfield_set_data(dfield, buf, col->len);
+ } else {
+ /* field length mismatch should not happen
+ when rebuilding the redundant row format
+ table. */
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ }
+ }
+
+ /* See if any columns were changed to NULL or NOT NULL. */
+ const dict_col_t* new_col
+ = dict_table_get_nth_col(log->table, col_no);
+ ut_ad(new_col->same_format(*col));
+
+ /* Assert that prtype matches except for nullability. */
+ ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype)
+ & ~(DATA_NOT_NULL | DATA_VERSIONED
+ | CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR)));
+
+ if (new_col->prtype == col->prtype) {
+ continue;
+ }
+
+ if ((new_col->prtype & DATA_NOT_NULL)
+ && dfield_is_null(dfield)) {
+
+ if (!log->allow_not_null) {
+ /* We got a NULL value for a NOT NULL column. */
+ *error = DB_INVALID_NULL;
+ return NULL;
+ }
+
+ const dfield_t& default_field
+ = log->defaults->fields[col_no];
+
+ Field* field = log->old_table->field[col->ind];
+
+ field->set_warning(Sql_condition::WARN_LEVEL_WARN,
+ WARN_DATA_TRUNCATED, 1,
+ ulong(log->n_rows));
+
+ *dfield = default_field;
+ }
+
+ /* Adjust the DATA_NOT_NULL flag in the parsed row. */
+ dfield_get_type(dfield)->prtype = new_col->prtype;
+
+ ut_ad(dict_col_type_assert_equal(new_col,
+ dfield_get_type(dfield)));
+ }
+
+ return(row);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert_low(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const dtuple_t* row, /*!< in: table row
+ in the old table definition */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup) /*!< in/out: for reporting
+ duplicate key errors */
+{
+ dberr_t error;
+ dtuple_t* entry;
+ const row_log_t*log = dup->index->online_log;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+ ulint n_index = 0;
+
+ ut_ad(dtuple_validate(row));
+
+ DBUG_LOG("ib_alter_table",
+ "insert table " << index->table->id << " (index "
+ << index->id << "): " << rec_printer(row).str());
+
+ static const ulint flags
+ = (BTR_CREATE_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG);
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+
+ error = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_TREE, index, index->n_uniq,
+ entry, 0, thr);
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ /* The row had already been copied to the table. */
+ return(DB_SUCCESS);
+ default:
+ return(error);
+ }
+
+ ut_ad(dict_index_is_clust(index));
+
+ for (n_index += index->type != DICT_CLUSTERED;
+ (index = dict_table_get_next_index(index)); n_index++) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ flags, BTR_INSERT_TREE,
+ index, offsets_heap, heap, entry,
+ thr_get_trx(thr)->id, thr);
+
+ if (error != DB_SUCCESS) {
+ if (error == DB_DUPLICATE_KEY) {
+ thr_get_trx(thr)->error_key_num = n_index;
+ }
+ break;
+ }
+ }
+
+ return(error);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const mrec_t* mrec, /*!< in: record to insert */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup) /*!< in/out: for reporting
+ duplicate key errors */
+{
+ row_log_t*log = dup->index->online_log;
+ dberr_t error;
+ const dtuple_t* row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, &error);
+
+ switch (error) {
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ error = row_log_table_apply_insert_low(
+ thr, row, offsets_heap, heap, dup);
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+ return(error);
+}
+
+/******************************************************//**
+Deletes a record from a table that is being rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_delete_low(
+/*===========================*/
+ btr_pcur_t* pcur, /*!< in/out: B-tree cursor,
+ will be trashed */
+ const rec_offs* offsets, /*!< in: offsets on pcur */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ will be committed */
+{
+ dberr_t error;
+ row_ext_t* ext;
+ dtuple_t* row;
+ dict_index_t* index = pcur->index();
+
+ ut_ad(dict_index_is_clust(index));
+
+ DBUG_LOG("ib_alter_table",
+ "delete table " << index->table->id << " (index "
+ << index->id << "): "
+ << rec_printer(btr_pcur_get_rec(pcur), offsets).str());
+
+ if (dict_table_get_next_index(index)) {
+ /* Build a row template for purging secondary index entries. */
+ row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(pcur),
+ offsets, NULL, NULL, NULL, &ext, heap);
+ } else {
+ row = NULL;
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, false, mtr);
+ if (error != DB_SUCCESS) {
+err_exit:
+ mtr->commit();
+ return error;
+ }
+
+ mtr->commit();
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ const dtuple_t* entry = row_build_index_entry(
+ row, ext, index, heap);
+ mtr->start();
+ index->set_modified(*mtr);
+ pcur->btr_cur.page_cur.index = index;
+ error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_PURGE_TREE, pcur,
+ mtr);
+ if (error) {
+ goto err_exit;
+ }
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
+ || btr_pcur_get_low_match(pcur) < index->n_uniq) {
+ /* All secondary index entries should be
+ found, because new_table is being modified by
+ this thread only, and all indexes should be
+ updated in sync. */
+ mtr->commit();
+ return(DB_INDEX_CORRUPT);
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE,
+ btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, false, mtr);
+ mtr->commit();
+ }
+
+ return(error);
+}
+
+/******************************************************//**
+Replays a delete operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_delete(
+/*=======================*/
+ ulint trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: merge record */
+ const rec_offs* moffsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const row_log_t* log) /*!< in: online log */
+{
+ dict_table_t* new_table = log->table;
+ dict_index_t* index = dict_table_get_first_index(new_table);
+ dtuple_t* old_pk;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ rec_offs* offsets;
+
+ pcur.btr_cur.page_cur.index = index;
+ ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field());
+ ut_ad(!rec_offs_any_extern(moffsets));
+
+ /* Convert the row to a search tuple. */
+ old_pk = dtuple_create(heap, index->n_uniq);
+ dict_index_copy_types(old_pk, index, index->n_uniq);
+
+ for (ulint i = 0; i < index->n_uniq; i++) {
+ ulint len;
+ const void* field;
+ field = rec_get_nth_field(mrec, moffsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+ dfield_set_data(dtuple_get_nth_field(old_pk, i),
+ field, len);
+ }
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ dberr_t err = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_PURGE_TREE, &pcur,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ goto all_done;
+ }
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+all_done:
+ mtr_commit(&mtr);
+ /* The record was not found. All done. */
+ /* This should only happen when an earlier
+ ROW_T_INSERT was skipped or
+ ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ due to BLOBs having been freed by rollback. */
+ return err;
+ }
+
+ offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */
+
+ {
+ ulint len;
+ const byte* mrec_trx_id
+ = rec_get_nth_field(mrec, moffsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ const byte* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_d(trx_id_check(rec_trx_id, log->min_trx));
+ ut_d(trx_id_check(mrec_trx_id, log->min_trx));
+
+ ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len)
+ == mrec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col + 1, &len)
+ == rec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ if (memcmp(mrec_trx_id, rec_trx_id,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ /* The ROW_T_DELETE was logged for a different
+ PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR.
+ This is possible if a ROW_T_INSERT was skipped
+ or a ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ because some BLOBs were missing due to
+ (1) rolling back the initial insert, or
+ (2) purging the BLOB for a later ROW_T_DELETE, or
+ (3) purging 'old values' for a later ROW_T_UPDATE
+ or ROW_T_DELETE. */
+ ut_ad(!log->same_pk);
+ goto all_done;
+ }
+ }
+
+ return row_log_table_apply_delete_low(&pcur, offsets, heap, &mtr);
+}
+
+/******************************************************//**
+Replays an update operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_update(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: new value */
+ const rec_offs* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ const dtuple_t* old_pk) /*!< in: PRIMARY KEY and
+ DB_TRX_ID,DB_ROLL_PTR
+ of the old value,
+ or PRIMARY KEY if same_pk */
+{
+ row_log_t* log = dup->index->online_log;
+ const dtuple_t* row;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ dberr_t error;
+ ulint n_index = 0;
+
+ pcur.btr_cur.page_cur.index = index;
+
+ ut_ad(dtuple_get_n_fields_cmp(old_pk)
+ == dict_index_get_n_unique(index));
+ ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2)
+ == dict_index_get_n_unique(index));
+
+ row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, &error);
+
+ switch (error) {
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+ error = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE, &pcur,
+ &mtr);
+ if (error != DB_SUCCESS) {
+func_exit:
+ mtr.commit();
+func_exit_committed:
+ ut_ad(mtr.has_committed());
+ ut_free(pcur.old_rec_buf);
+
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+
+ return error;
+ }
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ ut_ad(0);/* We did not request buffering. */
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ && btr_pcur_get_low_match(&pcur) >= index->n_uniq);
+
+ /* Prepare to update (or delete) the record. */
+ rec_offs* cur_offsets = rec_get_offsets(
+ btr_pcur_get_rec(&pcur), index, nullptr, index->n_core_fields,
+ ULINT_UNDEFINED, &offsets_heap);
+
+#ifdef UNIV_DEBUG
+ if (!log->same_pk) {
+ ulint len;
+ const byte* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur),
+ cur_offsets, index->n_uniq, &len);
+ const dfield_t* old_pk_trx_id
+ = dtuple_get_nth_field(old_pk, index->n_uniq);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_d(trx_id_check(rec_trx_id, log->min_trx));
+ ut_ad(old_pk_trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(old_pk_trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(DATA_TRX_ID_LEN
+ + static_cast<const char*>(old_pk_trx_id->data)
+ == old_pk_trx_id[1].data);
+ ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx));
+ ut_ad(!memcmp(rec_trx_id, old_pk_trx_id->data,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+ }
+#endif
+
+ dtuple_t* entry = row_build_index_entry_low(
+ row, NULL, index, heap, ROW_BUILD_NORMAL);
+ upd_t* update = row_upd_build_difference_binary(
+ index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
+ false, false, NULL, heap, dup->table, &error);
+ if (error != DB_SUCCESS || !update->n_fields) {
+ goto func_exit;
+ }
+
+ const bool pk_updated
+ = upd_get_nth_field(update, 0)->field_no < new_trx_id_col;
+
+ if (pk_updated || rec_offs_any_extern(cur_offsets)) {
+ /* If the record contains any externally stored
+ columns, perform the update by delete and insert,
+ because we will not write any undo log that would
+ allow purge to free any orphaned externally stored
+ columns. */
+
+ if (pk_updated && log->same_pk) {
+ /* The ROW_T_UPDATE log record should only be
+ written when the PRIMARY KEY fields of the
+ record did not change in the old table. We
+ can only get a change of PRIMARY KEY columns
+ in the rebuilt table if the PRIMARY KEY was
+ redefined (!same_pk). */
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ error = row_log_table_apply_delete_low(
+ &pcur, cur_offsets, heap, &mtr);
+ ut_ad(mtr.has_committed());
+
+ if (error == DB_SUCCESS) {
+ error = row_log_table_apply_insert_low(
+ thr, row, offsets_heap, heap, dup);
+ }
+
+ goto func_exit_committed;
+ }
+
+ dtuple_t* old_row;
+ row_ext_t* old_ext;
+
+ if (dict_table_get_next_index(index)) {
+ /* Construct the row corresponding to the old value of
+ the record. */
+ old_row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur),
+ cur_offsets, NULL, NULL, NULL, &old_ext, heap);
+ ut_ad(old_row);
+
+ DBUG_LOG("ib_alter_table",
+ "update table " << index->table->id
+ << " (index " << index->id
+ << ": " << rec_printer(old_row).str()
+ << " to " << rec_printer(row).str());
+ } else {
+ old_row = NULL;
+ old_ext = NULL;
+ }
+
+ big_rec_t* big_rec;
+
+ error = btr_cur_pessimistic_update(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG
+ | BTR_KEEP_POS_FLAG,
+ btr_pcur_get_btr_cur(&pcur),
+ &cur_offsets, &offsets_heap, heap, &big_rec,
+ update, 0, thr, 0, &mtr);
+
+ if (big_rec) {
+ if (error == DB_SUCCESS) {
+ error = btr_store_big_rec_extern_fields(
+ &pcur, cur_offsets, big_rec, &mtr,
+ BTR_STORE_UPDATE);
+ }
+
+ dtuple_big_rec_free(big_rec);
+ }
+
+ for (n_index += index->type != DICT_CLUSTERED;
+ (index = dict_table_get_next_index(index)); n_index++) {
+ if (!index->is_btree()) {
+ continue;
+ }
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ if (!row_upd_changes_ord_field_binary(
+ index, update, thr, old_row, NULL)) {
+ continue;
+ }
+
+ if (dict_index_has_virtual(index)) {
+ dtuple_copy_v_fields(old_row, old_pk);
+ }
+
+ mtr.commit();
+
+ entry = row_build_index_entry(old_row, old_ext, index, heap);
+ if (!entry) {
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ goto func_exit_committed;
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+ pcur.btr_cur.page_cur.index = index;
+
+ ut_free(pcur.old_rec_buf);
+ pcur.old_rec_buf = nullptr;
+
+ if (ROW_FOUND != row_search_index_entry(
+ entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ break;
+ }
+
+ btr_cur_pessimistic_delete(
+ &error, FALSE, btr_pcur_get_btr_cur(&pcur),
+ BTR_CREATE_FLAG, false, &mtr);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ mtr.commit();
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG,
+ BTR_INSERT_TREE, index, offsets_heap, heap,
+ entry, thr_get_trx(thr)->id, thr);
+
+ /* Report correct index name for duplicate key error. */
+ if (error == DB_DUPLICATE_KEY) {
+ thr_get_trx(thr)->error_key_num = n_index;
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+ }
+
+ goto func_exit;
+}
+
+/******************************************************//**
+Applies an operation to a table that was rebuilt.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const mrec_t*
+row_log_table_apply_op(
+/*===================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in new index */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS
+ or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ rec_offs* offsets) /*!< in/out: work area
+ for parsing mrec */
+{
+ row_log_t* log = dup->index->online_log;
+ dict_index_t* new_index = dict_table_get_first_index(log->table);
+ ulint extra_size;
+ const mrec_t* next_mrec;
+ dtuple_t* old_pk;
+
+ ut_ad(dict_index_is_clust(dup->index));
+ ut_ad(dup->index->table != log->table);
+ ut_ad(log->head.total <= log->tail.total);
+
+ *error = DB_SUCCESS;
+
+ /* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */
+ if (mrec + 3 >= mrec_end) {
+ return(NULL);
+ }
+
+ const bool is_instant = log->is_instant(dup->index);
+ const mrec_t* const mrec_start = mrec;
+
+ switch (*mrec++) {
+ default:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ case ROW_T_INSERT:
+ extra_size = *mrec++;
+
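+		/* extra_size is stored in a variable-length encoding:
+		values 0..0x7f take one byte; larger values (up to
+		0x7fff) take two bytes with the high bit of the first
+		byte set.  For example, the bytes 0x81 0x2c decode to
+		(0x01 << 8) | 0x2c = 300. */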
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields, log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ } else {
+ log->head.total += ulint(next_mrec - mrec_start);
+ *error = row_log_table_apply_insert(
+ thr, mrec, offsets, offsets_heap,
+ heap, dup);
+ }
+ break;
+
+ case ROW_T_DELETE:
+ extra_size = *mrec++;
+ ut_ad(mrec < mrec_end);
+
+		/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
+		For fixed-length PRIMARY KEY columns, it is 0. */
+ mrec += extra_size;
+
+ /* The ROW_T_DELETE record was converted by
+ rec_convert_dtuple_to_temp() using new_index. */
+ ut_ad(!new_index->is_instant());
+ rec_offs_set_n_fields(offsets, new_index->first_user_field());
+ rec_init_offsets_temp(mrec, new_index, offsets);
+ next_mrec = mrec + rec_offs_data_size(offsets);
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ log->head.total += ulint(next_mrec - mrec_start);
+
+ *error = row_log_table_apply_delete(
+ new_trx_id_col,
+ mrec, offsets, offsets_heap, heap, log);
+ break;
+
+ case ROW_T_UPDATE:
+ /* Logically, the log entry consists of the
+ (PRIMARY KEY,DB_TRX_ID) of the old value (converted
+ to the new primary key definition) followed by
+ the new value in the old table definition. If the
+ definition of the columns belonging to PRIMARY KEY
+ is not changed, the log will only contain
+ DB_TRX_ID,new_row. */
+
+ if (log->same_pk) {
+ ut_ad(new_index->n_uniq == dup->index->n_uniq);
+
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields,
+ log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ old_pk = dtuple_create(heap, new_index->n_uniq);
+ dict_index_copy_types(
+ old_pk, new_index, old_pk->n_fields);
+
+ /* Copy the PRIMARY KEY fields from mrec to old_pk. */
+ for (ulint i = 0; i < new_index->n_uniq; i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ /* We assume extra_size < 0x100
+ for the PRIMARY KEY prefix. */
+ mrec += *mrec + 1;
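+			/* This skips the 1-byte extra_size and the
+			extra_size bytes of record header, so that mrec
+			points to the data origin of the old PRIMARY
+			KEY prefix record. */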
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ /* Get offsets for PRIMARY KEY,
+ DB_TRX_ID, DB_ROLL_PTR. */
+ /* The old_pk prefix was converted by
+ rec_convert_dtuple_to_temp() using new_index. */
+ ut_ad(!new_index->is_instant());
+ rec_offs_set_n_fields(offsets,
+ new_index->first_user_field());
+ rec_init_offsets_temp(mrec, new_index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+ if (next_mrec + 2 > mrec_end) {
+ return(NULL);
+ }
+
+ /* Copy the PRIMARY KEY fields and
+ DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
+ old_pk = dtuple_create(heap,
+ new_index->first_user_field());
+ dict_index_copy_types(old_pk, new_index,
+ old_pk->n_fields);
+
+ for (ulint i = 0; i < new_index->first_user_field();
+ i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+
+ mrec = next_mrec;
+
+ /* Fetch the new value of the row as it was
+ in the old table definition. */
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ ut_ad(extra_size || !is_instant);
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets,
+ log->n_core_fields,
+ log->non_core_fields,
+ is_instant
+ ? static_cast<rec_comp_status_t>(
+ *(mrec - extra_size))
+ : REC_STATUS_ORDINARY);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+ }
+
+ ut_ad(next_mrec <= mrec_end);
+ log->head.total += ulint(next_mrec - mrec_start);
+ dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
+
+ *error = row_log_table_apply_update(
+ thr, new_trx_id_col,
+ mrec, offsets, offsets_heap, heap, dup, old_pk);
+ break;
+ }
+
+ ut_ad(log->head.total <= log->tail.total);
+ mem_heap_empty(offsets_heap);
+ mem_heap_empty(heap);
+ return(next_mrec);
+}
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Estimate by how much the ALTER TABLE progress should be incremented
+per one block of log applied.
+For the other phases of ALTER TABLE we increment the progress by 1 per
+page processed.
+@return amount of abstract units to add to work_completed when one block
+of log is applied.
+*/
+inline
+ulint
+row_log_progress_inc_per_block()
+{
+	/* We must increment the progress once per page (one page being
+	srv_page_size bytes; innodb_page_size defaults to 16KiB).
+	One block here is srv_sort_buf_size (usually 1MiB). */
+ const ulint pages_per_block = std::max<ulint>(
+ ulint(srv_sort_buf_size >> srv_page_size_shift), 1);
+
+	/* Multiply by an artificial factor of 6 to even the pace with
+	the rest of the ALTER TABLE phases, which process page_size
+	amounts of data faster. */
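+	/* For example, with srv_sort_buf_size = 1MiB and 16KiB pages,
+	pages_per_block = 64, so one block of log counts as
+	64 * 6 = 384 abstract units of progress. */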
+ return(pages_per_block * 6);
+}
+
+/** Estimate how much work is to be done by the log apply phase
+of an ALTER TABLE for this index.
+@param[in] index index whose log to assess
+@return work to be done by log-apply in abstract units
+*/
+ulint
+row_log_estimate_work(
+ const dict_index_t* index)
+{
+ if (index == NULL || index->online_log == NULL
+ || index->online_log_is_dummy()) {
+ return(0);
+ }
+
+ const row_log_t* l = index->online_log;
+ const ulint bytes_left =
+ static_cast<ulint>(l->tail.total - l->head.total);
+ const ulint blocks_left = bytes_left / srv_sort_buf_size;
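+	/* For example, 3MiB of unapplied log with the usual 1MiB
+	srv_sort_buf_size yields blocks_left = 3, so the estimate is
+	3 * row_log_progress_inc_per_block() abstract units. */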
+
+ return(blocks_left * row_log_progress_inc_per_block());
+}
+#else /* HAVE_PSI_STAGE_INTERFACE */
+inline
+ulint
+row_log_progress_inc_per_block()
+{
+ return(0);
+}
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Applies operations to a table that was rebuilt.
+@param[in] thr query graph
+@param[in,out] dup for reporting duplicate key errors
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, then stage->inc() will be called for each block
+of log that is applied.
+@return DB_SUCCESS, or error code on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_log_table_apply_ops(
+ que_thr_t* thr,
+ row_merge_dup_t* dup,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end = NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ rec_offs* offsets;
+ bool has_index_lock;
+ dict_index_t* index = const_cast<dict_index_t*>(
+ dup->index);
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(
+ new_table);
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + std::max<ulint>(index->n_fields,
+ new_index->first_user_field());
+ const ulint new_trx_id_col = dict_col_get_clust_pos(
+ dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(trx->mysql_thd);
+ ut_ad(index->lock.have_x());
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, DATA_TRX_ID), index)
+ != ULINT_UNDEFINED);
+ ut_ad(new_trx_id_col > 0);
+ ut_ad(new_trx_id_col != ULINT_UNDEFINED);
+
+ MEM_UNDEFINED(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+
+ heap = mem_heap_create(srv_page_size);
+ offsets_heap = mem_heap_create(srv_page_size);
+ has_index_lock = true;
+
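+	/* The log is consumed from head (the read/apply position)
+	towards tail (the position where concurrent DML appends).
+	While head.blocks < tail.blocks, full blocks are read back
+	from the temporary file; once head.blocks == tail.blocks,
+	the remaining bytes are parsed directly from the in-memory
+	tail.block. */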
+next_block:
+ ut_ad(has_index_lock);
+ ut_ad(index->lock.have_u_or_x());
+ ut_ad(index->online_log->head.bytes == 0);
+
+ stage->inc(row_log_progress_inc_per_block());
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ if (index->is_corrupted()) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ ib::error() << "Unexpected end of temporary file for table "
+ << index->table->name;
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (index->online_log->fd > 0
+ && ftruncate(index->online_log->fd, 0) == -1) {
+ ib::error()
+ << "\'" << index->name + 1
+ << "\' failed with error "
+ << errno << ":" << strerror(errno);
+
+ goto corruption;
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ index->online_log->head.bytes = 0;
+ index->online_log->tail.bytes = 0;
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs;
+
+ ofs = (os_offset_t) index->online_log->head.blocks
+ * srv_sort_buf_size;
+
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ index->lock.x_unlock();
+
+ log_free_check();
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ byte* buf = index->online_log->head.block;
+
+ if (DB_SUCCESS
+ != os_file_read(IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, nullptr)) {
+ ib::error()
+ << "Unable to read temporary file"
+ " for table " << index->table->name;
+ goto corruption;
+ }
+
+ if (srv_encrypt_log) {
+ if (!log_tmp_block_decrypt(
+ buf, srv_sort_buf_size,
+ index->online_log->crypt_head, ofs)) {
+ error = DB_DECRYPTION_FAILED;
+ goto func_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_decrypted.inc();
+ memcpy(buf, index->online_log->crypt_head,
+ srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ /* This read is not protected by index->online_log->mutex for
+ performance reasons. We will eventually notice any error that
+ was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (mrec) {
+ /* A partial record was read from the previous block.
+		Fill the rest of the temporary buffer, as we do not know the
+ length of the record. Parse subsequent records from
+ the bigger buffer index->online_log->head.block
+ or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ ulint((&index->online_log->head.buf)[1] - mrec_end));
+ mrec = row_log_table_apply_op(
+ thr, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be
+		truncated. Now that the parse buffer has been extended,
+		parsing should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = ulint(mrec - mrec_end);
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+ /* mrec!=NULL means that the next record starts from the
+ middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec <= mrec_end);
+
+ if (mrec == mrec_end) {
+ /* We are at the end of the log.
+ Mark the replay all_done. */
+ if (has_index_lock) {
+ goto all_done;
+ }
+ }
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ /* This read is not protected by index->online_log->mutex
+ for performance reasons. We will eventually notice any
+ error that was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ next_mrec = row_log_table_apply_op(
+ thr, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ index->lock.x_lock(SRW_LOCK_CALL);
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes
+ += ulint(next_mrec - mrec);
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ ulint(mrec_end - mrec));
+ mrec_end += ulint(index->online_log->head.buf - mrec);
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ }
+
+ mem_heap_free(offsets_heap);
+ mem_heap_free(heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
+
+/** Apply the row_log_table log to a table upon completing rebuild.
+@param[in] thr query graph
+@param[in] old_table old table
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_table() will be called initially and then
+stage->inc() will be called for each block of log that is applied.
+@param[in] new_table Altered table
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_table_apply(
+ que_thr_t* thr,
+ dict_table_t* old_table,
+ struct TABLE* table,
+ ut_stage_alter_t* stage,
+ dict_table_t* new_table)
+{
+ dberr_t error;
+ dict_index_t* clust_index;
+
+ thr_get_trx(thr)->error_key_num = 0;
+ DBUG_EXECUTE_IF("innodb_trx_duplicates",
+ thr_get_trx(thr)->duplicates = TRX_DUP_REPLACE;);
+
+ stage->begin_phase_log_table();
+
+ clust_index = dict_table_get_first_index(old_table);
+
+ if (clust_index->online_log->n_rows == 0) {
+ clust_index->online_log->n_rows = new_table->stat_n_rows;
+ }
+
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+
+ if (!clust_index->online_log) {
+ ut_ad(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+ /* This function should not be called unless
+ rebuilding a table online. Build in some fault
+ tolerance. */
+ ut_ad(0);
+ error = DB_ERROR;
+ } else {
+ row_merge_dup_t dup = {
+ clust_index, table,
+ clust_index->online_log->col_map, 0
+ };
+
+ error = row_log_table_apply_ops(thr, &dup, stage);
+
+ ut_ad(error != DB_SUCCESS
+ || clust_index->online_log->head.total
+ == clust_index->online_log->tail.total);
+ }
+
+ clust_index->lock.x_unlock();
+ DBUG_EXECUTE_IF("innodb_trx_duplicates",
+ thr_get_trx(thr)->duplicates = 0;);
+
+ return(error);
+}
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@return true on success, false on failure */
+bool
+row_log_allocate(
+/*=============*/
+ const trx_t* trx, /*!< in: the ALTER TABLE transaction */
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added, changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+ const char* path, /*!< in: where to create temporary file */
+ const TABLE* old_table, /*!< in: table definition before alter */
+ const bool allow_not_null) /*!< in: allow null to not-null
+ conversion */
+{
+ row_log_t* log;
+ DBUG_ENTER("row_log_allocate");
+
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_clust(index) == !!table);
+ ut_ad(!table || index->table != table);
+ ut_ad(same_pk || table);
+ ut_ad(!table || col_map);
+ ut_ad(!defaults || col_map);
+ ut_ad(index->lock.have_u_or_x());
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->id);
+
+ log = static_cast<row_log_t*>(ut_malloc_nokey(sizeof *log));
+
+ if (log == NULL) {
+ DBUG_RETURN(false);
+ }
+
+ log->fd = OS_FILE_CLOSED;
+ mysql_mutex_init(index_online_log_key, &log->mutex, nullptr);
+
+ log->table = table;
+ log->same_pk = same_pk;
+ log->defaults = defaults;
+ log->col_map = col_map;
+ log->error = DB_SUCCESS;
+ log->min_trx = trx->id;
+ log->max_trx = 0;
+ log->tail.blocks = log->tail.bytes = 0;
+ log->tail.total = 0;
+ log->tail.block = log->head.block = NULL;
+ log->crypt_tail = log->crypt_head = NULL;
+ log->head.blocks = log->head.bytes = 0;
+ log->head.total = 0;
+ log->path = path;
+ log->n_core_fields = index->n_core_fields;
+ ut_ad(!table || log->is_instant(index)
+ == (index->n_core_fields < index->n_fields));
+ log->allow_not_null = allow_not_null;
+ log->old_table = old_table;
+ log->n_rows = 0;
+
+ if (table && index->is_instant()) {
+ const unsigned n = log->n_core_fields;
+ log->non_core_fields = UT_NEW_ARRAY_NOKEY(
+ dict_col_t::def_t, index->n_fields - n);
+ for (unsigned i = n; i < index->n_fields; i++) {
+ log->non_core_fields[i - n]
+ = index->fields[i].col->def_val;
+ }
+ } else {
+ log->non_core_fields = NULL;
+ }
+
+ dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
+
+ if (srv_encrypt_log) {
+ log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size;
+ log->crypt_head = static_cast<byte *>(
+ my_large_malloc(&log->crypt_head_size, MYF(MY_WME)));
+ log->crypt_tail = static_cast<byte *>(
+ my_large_malloc(&log->crypt_tail_size, MYF(MY_WME)));
+
+ if (!log->crypt_head || !log->crypt_tail) {
+ row_log_free(log);
+ DBUG_RETURN(false);
+ }
+ }
+
+ index->online_log = log;
+
+ if (!table) {
+		/* Assign the clustered index online log to the table.
+		Concurrent DML can use it to determine whether any
+		online DDL is in progress on the table. */
+ index->table->indexes.start->online_log_make_dummy();
+ log->alter_trx = trx;
+ }
+
+ /* While we might be holding an exclusive data dictionary lock
+ here, in row_log_abort_sec() we will not always be holding it. Use
+ atomic operations in both cases. */
+ MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX);
+
+ DBUG_RETURN(true);
+}
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+void
+row_log_free(
+/*=========*/
+ row_log_t* log) /*!< in,own: row log */
+{
+ MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
+
+ UT_DELETE_ARRAY(log->non_core_fields);
+ row_log_block_free(log->tail);
+ row_log_block_free(log->head);
+ row_merge_file_destroy_low(log->fd);
+
+ if (log->crypt_head) {
+ my_large_free(log->crypt_head, log->crypt_head_size);
+ }
+
+ if (log->crypt_tail) {
+ my_large_free(log->crypt_tail, log->crypt_tail_size);
+ }
+
+ mysql_mutex_destroy(&log->mutex);
+ ut_free(log);
+}
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+{
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION);
+#ifdef SAFE_MUTEX
+ ut_ad(index->lock.have_x()
+ || (index->lock.have_s()
+ && mysql_mutex_is_owner(&index->online_log->mutex)));
+#endif
+ return(index->online_log->max_trx);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_log_apply_op_low(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ enum row_op op, /*!< in: operation being applied */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ const dtuple_t* entry) /*!< in: row */
+{
+ mtr_t mtr;
+ btr_cur_t cursor;
+ rec_offs* offsets = NULL;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ ut_ad(index->lock.have_x() == has_index_lock);
+
+ ut_ad(!index->is_corrupted());
+ ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
+
+ DBUG_LOG("ib_create_index",
+ (op == ROW_OP_INSERT ? "insert " : "delete ")
+ << (has_index_lock ? "locked index " : "unlocked index ")
+ << index->id << ',' << ib::hex(trx_id) << ": "
+ << rec_printer(entry).str());
+
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ cursor.page_cur.index = index;
+ if (has_index_lock) {
+ mtr_x_lock_index(index, &mtr);
+ }
+
+ /* We perform the pessimistic variant of the operations if we
+ already hold index->lock exclusively. First, search the
+ record. The operation may already have been performed,
+ depending on when the row in the clustered index was
+ scanned. */
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock
+ ? BTR_MODIFY_TREE_ALREADY_LATCHED
+ : BTR_MODIFY_LEAF, &mtr);
+ if (UNIV_UNLIKELY(*error != DB_SUCCESS)) {
+ goto func_exit;
+ }
+
+ ut_ad(dict_index_get_n_unique(index) > 0);
+ /* This test is somewhat similar to row_ins_must_modify_rec(),
+ but not identical for unique secondary indexes. */
+ if (cursor.low_match >= dict_index_get_n_unique(index)
+ && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) {
+ /* We have a matching record. */
+ bool exists = (cursor.low_match
+ == dict_index_get_n_fields(index));
+#ifdef UNIV_DEBUG
+ rec_t* rec = btr_cur_get_rec(&cursor);
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+#endif /* UNIV_DEBUG */
+
+ ut_ad(exists || dict_index_is_unique(index));
+
+ switch (op) {
+ case ROW_OP_DELETE:
+ if (!exists) {
+ /* The existing record matches the
+ unique secondary index key, but the
+ PRIMARY KEY columns differ. So, this
+ exact record does not exist. For
+ example, we could detect a duplicate
+ key error in some old index before
+				logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could have
+ been logged for rolling back
+ TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ }
+
+ *error = btr_cur_optimistic_delete(
+ &cursor, BTR_CREATE_FLAG, &mtr);
+
+ if (*error != DB_FAIL) {
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE,
+ &mtr);
+ if (UNIV_UNLIKELY(*error != DB_SUCCESS)) {
+ goto func_exit;
+ }
+ /* No other thread than the current one
+ is allowed to modify the index tree.
+ Thus, the record should still exist. */
+ ut_ad(cursor.low_match
+ >= dict_index_get_n_fields(index));
+ ut_ad(page_rec_is_user_rec(
+ btr_cur_get_rec(&cursor)));
+ }
+
+ /* As there are no externally stored fields in
+ a secondary index record, the parameter
+ rollback=false will be ignored. */
+
+ btr_cur_pessimistic_delete(
+ error, FALSE, &cursor,
+ BTR_CREATE_FLAG, false, &mtr);
+ break;
+ case ROW_OP_INSERT:
+ if (exists) {
+ /* The record already exists. There
+ is nothing to be inserted.
+ This could happen when processing
+ TRX_UNDO_DEL_MARK_REC in statement
+ rollback:
+
+ UPDATE of PRIMARY KEY can lead to
+ statement rollback if the updated
+ value of the PRIMARY KEY already
+ exists. In this case, the UPDATE would
+ be mapped to DELETE;INSERT, and we
+ only wrote undo log for the DELETE
+ part. The duplicate key error would be
+ triggered before logging the INSERT
+ part.
+
+ Theoretically, we could also get a
+ similar situation when a DELETE operation
+ is blocked by a FOREIGN KEY constraint. */
+ goto func_exit;
+ }
+
+ if (dtuple_contains_null(entry)) {
+ /* The UNIQUE KEY columns match, but
+ there is a NULL value in the key, and
+ NULL!=NULL. */
+ goto insert_the_rec;
+ }
+
+ goto duplicate;
+ }
+ } else {
+ switch (op) {
+ rec_t* rec;
+ big_rec_t* big_rec;
+ case ROW_OP_DELETE:
+ /* The record does not exist. For example, we
+ could detect a duplicate key error in some old
+			index before logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could be logged for
+ rolling back TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ case ROW_OP_INSERT:
+ if (dict_index_is_unique(index)
+ && (cursor.up_match
+ >= dict_index_get_n_unique(index)
+ || cursor.low_match
+ >= dict_index_get_n_unique(index))
+ && (!index->n_nullable
+ || !dtuple_contains_null(entry))) {
+duplicate:
+ /* Duplicate key */
+ ut_ad(dict_index_is_unique(index));
+ row_merge_dup_report(dup, entry->fields);
+ *error = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+insert_the_rec:
+ /* Insert the record. As we are inserting into
+ a secondary index, there cannot be externally
+ stored columns (!big_rec). */
+ *error = btr_cur_optimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec, 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ if (*error != DB_FAIL) {
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ index->set_modified(mtr);
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE,
+ &mtr);
+ if (*error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ /* We already determined that the
+ record did not exist. No other thread
+ than the current one is allowed to
+ modify the index tree. Thus, the
+ record should still not exist. */
+
+ *error = btr_cur_pessimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec,
+ 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ break;
+ }
+ mem_heap_empty(offsets_heap);
+ }
+
+ if (*error == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const mrec_t*
+row_log_apply_op(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap for
+ allocating data tuples */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ rec_offs* offsets) /*!< in/out: work area for
+ rec_init_offsets_temp() */
+
+{
+ enum row_op op;
+ ulint extra_size;
+ ulint data_size;
+ dtuple_t* entry;
+ trx_id_t trx_id;
+
+ /* Online index creation is only used for secondary indexes. */
+ ut_ad(!dict_index_is_clust(index));
+
+ ut_ad(index->lock.have_x() == has_index_lock);
+
+ if (index->is_corrupted()) {
+ *error = DB_INDEX_CORRUPT;
+ return(NULL);
+ }
+
+ *error = DB_SUCCESS;
+
+ if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) {
+ return(NULL);
+ }
+
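+	/* Layout of a secondary-index log record, as parsed below:
+	1 byte of operation type (ROW_OP_INSERT or ROW_OP_DELETE),
+	then DATA_TRX_ID_LEN bytes of DB_TRX_ID (for ROW_OP_INSERT
+	only), then 1 or 2 bytes of extra_size, then extra_size bytes
+	of record header, then the record data. */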
+ switch (*mrec) {
+ case ROW_OP_INSERT:
+ if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) {
+ return(NULL);
+ }
+
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = trx_read_trx_id(mrec);
+ mrec += DATA_TRX_ID_LEN;
+ break;
+ case ROW_OP_DELETE:
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = 0;
+ break;
+ default:
+corrupted:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ }
+
+ extra_size = *mrec++;
+
+ ut_ad(mrec < mrec_end);
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_init_offsets_temp(mrec, index, offsets);
+
+ if (rec_offs_any_extern(offsets)) {
+ /* There should never be any externally stored fields
+ in a secondary index, which is what online index
+ creation is used for. Therefore, the log file must be
+ corrupted. */
+ goto corrupted;
+ }
+
+ data_size = rec_offs_data_size(offsets);
+
+ mrec += data_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ entry = row_rec_to_index_entry_low(
+ mrec - data_size, index, offsets, heap);
+ /* Online index creation is only implemented for secondary
+ indexes, which never contain off-page columns. */
+ ut_ad(dtuple_get_n_ext(entry) == 0);
+
+ row_log_apply_op_low(index, dup, error, offsets_heap,
+ has_index_lock, op, trx_id, entry);
+ return(mrec);
+}
+
+/** Applies operations to a secondary index that was being created.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index index
+@param[in,out] dup for reporting duplicate key errors
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, then stage->inc() will be called for each block
+of log that is applied; it is nullptr when the row log is applied by a
+DML thread.
+@return DB_SUCCESS, or error code on failure */
+static
+dberr_t
+row_log_apply_ops(
+ const trx_t* trx,
+ dict_index_t* index,
+ row_merge_dup_t* dup,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end= NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+ bool has_index_lock;
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
+ ut_ad(!index->is_committed());
+ ut_ad(index->lock.have_x());
+ ut_ad(index->online_log);
+
+ MEM_UNDEFINED(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+
+ offsets_heap = mem_heap_create(srv_page_size);
+ heap = mem_heap_create(srv_page_size);
+ has_index_lock = true;
+
+next_block:
+ ut_ad(has_index_lock);
+ ut_ad(index->lock.have_x());
+ ut_ad(index->online_log->head.bytes == 0);
+
+ if (stage) {
+ stage->inc(row_log_progress_inc_per_block());
+ }
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ error = index->online_log->error;
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (index->is_corrupted()) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ ib::error() << "Unexpected end of temporary file for index "
+ << index->name;
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (index->online_log->fd > 0
+ && ftruncate(index->online_log->fd, 0) == -1) {
+ ib::error()
+ << "\'" << index->name + 1
+ << "\' failed with error "
+ << errno << ":" << strerror(errno);
+
+ goto corruption;
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ index->online_log->tail.bytes = 0;
+ index->online_log->head.bytes = 0;
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs = static_cast<os_offset_t>(
+ index->online_log->head.blocks)
+ * srv_sort_buf_size;
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ index->lock.x_unlock();
+
+ log_free_check();
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ byte* buf = index->online_log->head.block;
+
+ if (DB_SUCCESS
+ != os_file_read(IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, nullptr)) {
+ ib::error()
+ << "Unable to read temporary file"
+ " for index " << index->name;
+ goto corruption;
+ }
+
+ if (srv_encrypt_log) {
+ if (!log_tmp_block_decrypt(
+ buf, srv_sort_buf_size,
+ index->online_log->crypt_head, ofs)) {
+ error = DB_DECRYPTION_FAILED;
+ goto func_exit;
+ }
+
+ srv_stats.n_rowlog_blocks_decrypted.inc();
+ memcpy(buf, index->online_log->crypt_head, srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ if (mrec) {
+ /* A partial record was read from the previous block.
+		Fill the rest of the temporary buffer, as we do not know the
+ length of the record. Parse subsequent records from
+ the bigger buffer index->online_log->head.block
+ or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ ulint((&index->online_log->head.buf)[1] - mrec_end));
+ mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be
+		truncated. Now that the parse buffer has been extended,
+		parsing should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = ulint(mrec - mrec_end);
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+ /* mrec!=NULL means that the next record starts from the
+ middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec < mrec_end);
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ next_mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ index->lock.x_lock(SRW_LOCK_CALL);
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes
+ += ulint(next_mrec - mrec);
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ ulint(mrec_end - mrec));
+ mrec_end += ulint(index->online_log->head.buf - mrec);
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ }
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_INDEX_CORRUPT:
+ if (((os_offset_t) index->online_log->tail.blocks + 1)
+ * srv_sort_buf_size >= srv_online_max_size) {
+ /* The log file grew too big. */
+ error = DB_ONLINE_LOG_TOO_BIG;
+ }
+ /* fall through */
+ default:
+ index->type |= DICT_CORRUPT;
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
+
+/** Apply the row log to the index upon completing index creation.
+@param[in] trx transaction (for checking if the operation was
+interrupted)
+@param[in,out] index secondary index
+@param[in,out] table MySQL table (for reporting duplicates)
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
+stage->inc() will be called for each block of log that is applied; it is
+nullptr when the row log has already been applied by a DML thread.
+@return DB_SUCCESS, or error code on failure */
+dberr_t
+row_log_apply(
+ const trx_t* trx,
+ dict_index_t* index,
+ struct TABLE* table,
+ ut_stage_alter_t* stage)
+{
+ dberr_t error;
+ row_merge_dup_t dup = { index, table, NULL, 0 };
+ DBUG_ENTER("row_log_apply");
+
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
+ ut_ad(!dict_index_is_clust(index));
+
+ if (stage) {
+ stage->begin_phase_log_index();
+ }
+
+ log_free_check();
+
+ index->lock.x_lock(SRW_LOCK_CALL);
+
+ if (index->online_log && !index->table->corrupted) {
+ error = row_log_apply_ops(trx, index, &dup, stage);
+ } else {
+ error = DB_SUCCESS;
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_ad(index->table->space);
+ index->type |= DICT_CORRUPT;
+ index->table->drop_aborted = TRUE;
+
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ } else if (stage) {
+		/* Mark the index as complete only when this is
+		being called by the DDL thread */
+ ut_ad(dup.n_dup == 0);
+ dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
+ }
+
+ index->lock.x_unlock();
+
+ DBUG_RETURN(error);
+}
+
+unsigned row_log_get_n_core_fields(const dict_index_t *index)
+{
+ ut_ad(index->online_log);
+ return index->online_log->n_core_fields;
+}
+
+dberr_t row_log_get_error(const dict_index_t *index)
+{
+ ut_ad(index->online_log);
+ return index->online_log->error;
+}
+
+dberr_t dict_table_t::clear(que_thr_t *thr)
+{
+ dberr_t err= DB_SUCCESS;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(indexes); index;
+ index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (index->type & DICT_FTS)
+ continue;
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ continue;
+ case ONLINE_INDEX_COMPLETE:
+ break;
+ case ONLINE_INDEX_CREATION:
+ ut_ad("invalid type" == 0);
+ MY_ASSERT_UNREACHABLE();
+ break;
+ }
+ if (dberr_t err_index= index->clear(thr))
+ err= err_index;
+ }
+ return err;
+}
+
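+/** Check whether a roll pointer refers to the undo log record that is
+currently being applied: its least significant 16 bits (the byte offset
+within the undo page) are compared against this->offset, and the next
+32 bits (the undo page number) against page_id.page_no(). */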
+inline bool UndorecApplier::is_same(roll_ptr_t roll_ptr) const
+{
+ return uint16_t(roll_ptr) == offset &&
+ uint32_t(roll_ptr >> 16) == page_id.page_no();
+}
+
+const rec_t *
+UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets)
+{
+ ut_ad(index->is_primary());
+ btr_pcur_t pcur;
+
+ bool found= row_search_on_row_ref(&pcur, BTR_MODIFY_LEAF,
+ index->table, &tuple, &mtr);
+ ut_a(found);
+ *clust_rec= btr_pcur_get_rec(&pcur);
+
+ ulint len= 0;
+ rec_t *prev_version;
+ const rec_t *version= *clust_rec;
+ do
+ {
+ *offsets= rec_get_offsets(version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED,
+ &heap);
+ roll_ptr_t roll_ptr= trx_read_roll_ptr(
+ rec_get_nth_field(version, *offsets, index->db_roll_ptr(), &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ if (is_same(roll_ptr))
+ return version;
+ trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version,
+ nullptr, nullptr, 0);
+ version= prev_version;
+ }
+ while (version);
+
+ return nullptr;
+}
+
+/** Clear the online logs of all other online indexes after
+encountering an error during row_log_apply() in a DML thread
+@param table	table on which online DDL is in progress */
+static void row_log_mark_other_online_index_abort(dict_table_t *table)
+{
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ for (dict_index_t *index= dict_table_get_next_index(clust_index);
+ index; index= dict_table_get_next_index(index))
+ {
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted())
+ {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ row_log_abort_sec(index);
+ index->type|= DICT_CORRUPT;
+ index->lock.x_unlock();
+ MONITOR_ATOMIC_INC(MONITOR_BACKGROUND_DROP_INDEX);
+ }
+ }
+
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+ clust_index->online_log= nullptr;
+ clust_index->lock.x_unlock();
+ table->drop_aborted= TRUE;
+}
+
+void dtype_t::assign(const dict_col_t &col)
+{
+ prtype= col.prtype;
+ mtype= col.mtype;
+ len= col.len;
+ mbminlen= col.mbminlen;
+ mbmaxlen= col.mbmaxlen;
+}
+
+inline void dtuple_t::copy_field_types(const dict_index_t &index)
+{
+ ut_ad(index.n_fields == n_fields);
+ if (UNIV_LIKELY_NULL(index.change_col_info))
+ for (ulint i= 0; i < n_fields; i++)
+ fields[i].type.assign(*index.fields[i].col);
+}
+
+void UndorecApplier::log_insert(const dtuple_t &tuple,
+ dict_index_t *clust_index)
+{
+ DEBUG_SYNC_C("row_log_insert_handle");
+ ut_ad(clust_index->is_primary());
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+
+ rec_offs_init(offsets_);
+ mtr.start();
+ const rec_t *rec;
+ const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets);
+ if (!match_rec)
+ {
+ mtr.commit();
+ return;
+ }
+ const rec_t *copy_rec= match_rec;
+ if (match_rec == rec)
+ {
+ copy_rec= rec_copy(mem_heap_alloc(
+ heap, rec_offs_size(offsets)), match_rec, offsets);
+ rec_offs_make_valid(copy_rec, clust_index, true, offsets);
+ }
+ mtr.commit();
+
+ dict_table_t *table= clust_index->table;
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ if (clust_index->online_log &&
+ !clust_index->online_log_is_dummy() &&
+ clust_index->online_status <= ONLINE_INDEX_CREATION)
+ {
+ row_log_table_insert(copy_rec, clust_index, offsets);
+ clust_index->lock.s_unlock();
+ }
+ else
+ {
+ clust_index->lock.s_unlock();
+ row_ext_t *ext;
+ dtuple_t *row= row_build(ROW_COPY_POINTERS, clust_index,
+ copy_rec, offsets, table, nullptr, nullptr, &ext, heap);
+
+ if (table->n_v_cols)
+ {
+ /* Update the row with virtual column values present
+ in the undo log or update vector */
+ if (type == TRX_UNDO_UPD_DEL_REC)
+ row_upd_replace_vcol(row, table, update, false, nullptr,
+ (cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? nullptr : undo_rec);
+ else
+ trx_undo_read_v_cols(table, undo_rec, row, false);
+ }
+
+ bool success= true;
+ for (dict_index_t *index= clust_index;
+ (index= dict_table_get_next_index(index)) != nullptr; )
+ {
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted())
+ {
+ dtuple_t *entry= row_build_index_entry_low(row, ext, index,
+ heap, ROW_BUILD_NORMAL);
+ entry->copy_field_types(*index);
+ success= row_log_online_op(index, entry, trx_id);
+ }
+
+ index->lock.s_unlock();
+ if (!success)
+ {
+ row_log_mark_other_online_index_abort(index->table);
+ return;
+ }
+ }
+ }
+}
+
+void UndorecApplier::log_update(const dtuple_t &tuple,
+ dict_index_t *clust_index)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs *prev_offsets= offsets2_;
+
+ rec_offs_init(offsets_);
+ rec_offs_init(offsets2_);
+
+ dict_table_t *table= clust_index->table;
+
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ bool table_rebuild=
+ (clust_index->online_log
+ && !clust_index->online_log_is_dummy()
+ && clust_index->online_status <= ONLINE_INDEX_CREATION);
+ clust_index->lock.s_unlock();
+
+ mtr.start();
+ const rec_t *rec;
+ rec_t *prev_version;
+ bool is_update= (type == TRX_UNDO_UPD_EXIST_REC);
+ const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets);
+ if (!match_rec)
+ {
+ mtr.commit();
+ return;
+ }
+
+ if (table_rebuild)
+ {
+ const rec_t *copy_rec= match_rec;
+ if (match_rec == rec)
+ copy_rec= rec_copy(mem_heap_alloc(
+ heap, rec_offs_size(offsets)), match_rec, offsets);
+ trx_undo_prev_version_build(match_rec, clust_index, offsets, heap,
+ &prev_version, nullptr, nullptr, 0);
+
+ prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ rec_offs_make_valid(copy_rec, clust_index, true, offsets);
+ mtr.commit();
+
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ /* Recheck whether clustered index online log has been cleared */
+ if (clust_index->online_log)
+ {
+ if (is_update)
+ {
+ const dtuple_t *rebuilt_old_pk= row_log_table_get_pk(
+ prev_version, clust_index, prev_offsets, nullptr, &heap);
+ row_log_table_update(copy_rec, clust_index, offsets, rebuilt_old_pk);
+ }
+ else
+ row_log_table_delete(prev_version, clust_index, prev_offsets, nullptr);
+ }
+ clust_index->lock.s_unlock();
+ return;
+ }
+
+ dtuple_t *row= nullptr;
+ row_ext_t *new_ext;
+ if (match_rec != rec)
+ row= row_build(ROW_COPY_POINTERS, clust_index, match_rec, offsets,
+ clust_index->table, NULL, NULL, &new_ext, heap);
+ else
+ row= row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ clust_index->table, NULL, NULL, &new_ext, heap);
+ mtr.commit();
+ row_ext_t *old_ext;
+ dtuple_t *old_row= nullptr;
+ if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE))
+ {
+ for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++)
+ dfield_get_type(
+ dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING;
+ }
+
+ if (is_update)
+ {
+ old_row= dtuple_copy(row, heap);
+ row_upd_replace(old_row, &old_ext, clust_index, update, heap);
+ }
+
+ if (table->n_v_cols)
+ row_upd_replace_vcol(row, table, update, false, nullptr,
+ (cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? nullptr : undo_rec);
+
+ bool success= true;
+ dict_index_t *index= dict_table_get_next_index(clust_index);
+ while (index)
+ {
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted())
+ {
+ if (is_update)
+ {
+ /* Ignore the index if the update doesn't affect the index */
+ if (!row_upd_changes_ord_field_binary(index, update,
+ nullptr,
+ row, new_ext))
+ goto next_index;
+ dtuple_t *old_entry= row_build_index_entry_low(
+ old_row, old_ext, index, heap, ROW_BUILD_NORMAL);
+
+ old_entry->copy_field_types(*index);
+
+ success= row_log_online_op(index, old_entry, 0);
+
+ dtuple_t *new_entry= row_build_index_entry_low(
+ row, new_ext, index, heap, ROW_BUILD_NORMAL);
+
+ new_entry->copy_field_types(*index);
+
+ if (success)
+ success= row_log_online_op(index, new_entry, trx_id);
+ }
+ else
+ {
+ dtuple_t *old_entry= row_build_index_entry_low(
+ row, new_ext, index, heap, ROW_BUILD_NORMAL);
+
+ old_entry->copy_field_types(*index);
+
+ success= row_log_online_op(index, old_entry, 0);
+ }
+ }
+next_index:
+ index->lock.s_unlock();
+ if (!success)
+ {
+ row_log_mark_other_online_index_abort(index->table);
+ return;
+ }
+ index= dict_table_get_next_index(index);
+ }
+}
+
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
new file mode 100644
index 00000000..5df93fe6
--- /dev/null
+++ b/storage/innobase/row/row0merge.cc
@@ -0,0 +1,5406 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.cc
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+#include <my_global.h>
+#include <log.h>
+#include <sql_class.h>
+#include <math.h>
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0log.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "log0crypt.h"
+#include "dict0crea.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "ut0sort.h"
+#include "row0ftsort.h"
+#include "row0import.h"
+#include "row0vers.h"
+#include "handler0alter.h"
+#include "btr0bulk.h"
+#ifdef BTR_CUR_ADAPT
+# include "btr0sea.h"
+#endif /* BTR_CUR_ADAPT */
+#include "ut0stage.h"
+#include "fil0crypt.h"
+#include "srv0mon.h"
+
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined _WIN32
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* _WIN32 */
+
+/* Whether to disable file system cache */
+char srv_disable_sort_file_cache;
+
+/** Class that caches spatial index row tuples made from a single clustered
+index page scan, and then inserts them into the corresponding index tree */
+class spatial_index_info {
+public:
+ /** constructor
+ @param index spatial index to be created */
+ spatial_index_info(dict_index_t *index) : index(index)
+ {
+ ut_ad(index->is_spatial());
+ }
+
+  /** Cache an index row in the index tuple vector
+ @param[in] row table row
+ @param[in] ext externally stored column prefixes, or NULL */
+ void add(const dtuple_t *row, const row_ext_t *ext, mem_heap_t *heap)
+ {
+ dtuple_t *dtuple= row_build_index_entry(row, ext, index, heap);
+ ut_ad(dtuple);
+ ut_ad(dtuple->n_fields == index->n_fields);
+ if (ext)
+ {
+ /* Replace any references to ext, because ext will be allocated
+ from row_heap. */
+ for (ulint i= 1; i < dtuple->n_fields; i++)
+ {
+ dfield_t &dfield= dtuple->fields[i];
+ if (dfield.data >= ext->buf &&
+ dfield.data <= &ext->buf[ext->n_ext * ext->max_len])
+ dfield_dup(&dfield, heap);
+ }
+ }
+ m_dtuple_vec.push_back(dtuple);
+ }
+
+ /** Insert spatial index rows cached in vector into spatial index
+ @param[in] trx_id transaction id
+ @param[in] pcur cluster index scanning cursor
+ @param[in,out] mtr_started whether scan_mtr is active
+ @param[in,out] heap temporary memory heap
+ @param[in,out] scan_mtr mini-transaction for pcur
+ @return DB_SUCCESS if successful, else error number */
+ dberr_t insert(trx_id_t trx_id, btr_pcur_t* pcur,
+ bool& mtr_started, mem_heap_t* heap, mtr_t* scan_mtr)
+ {
+ big_rec_t* big_rec;
+ rec_t* rec;
+ btr_cur_t ins_cur;
+ mtr_t mtr;
+ rtr_info_t rtr_info;
+ rec_offs* ins_offsets = NULL;
+ dberr_t error = DB_SUCCESS;
+ dtuple_t* dtuple;
+ const ulint flag = BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG;
+
+ ut_ad(mtr_started == scan_mtr->is_active());
+
+ DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush",
+ log_sys.set_check_flush_or_checkpoint(););
+
+ for (idx_tuple_vec::iterator it = m_dtuple_vec.begin();
+ it != m_dtuple_vec.end();
+ ++it) {
+ dtuple = *it;
+ ut_ad(dtuple);
+
+ if (log_sys.check_flush_or_checkpoint()) {
+ if (mtr_started) {
+ if (!btr_pcur_move_to_prev_on_page(pcur)) {
+ error = DB_CORRUPTION;
+ break;
+ }
+ btr_pcur_store_position(pcur, scan_mtr);
+ scan_mtr->commit();
+ mtr_started = false;
+ }
+
+ log_free_check();
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+
+ ins_cur.page_cur.index = index;
+ rtr_init_rtr_info(&rtr_info, false, &ins_cur, index,
+ false);
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_LEAF, &mtr);
+
+			/* The MBR in the parent entry may need to be
+			updated, so change the mode to BTR_MODIFY_TREE */
+ if (error == DB_SUCCESS && rtr_info.mbr_adj) {
+ mtr.commit();
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false, &ins_cur,
+ index, false);
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+ mtr.start();
+ index->set_modified(mtr);
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_TREE, &mtr);
+ }
+
+ if (error == DB_SUCCESS) {
+ error = btr_cur_optimistic_insert(
+ flag, &ins_cur, &ins_offsets,
+ &heap, dtuple, &rec, &big_rec,
+ 0, NULL, &mtr);
+ }
+
+ ut_ad(!big_rec);
+
+ if (error == DB_FAIL) {
+ mtr.commit();
+ mtr.start();
+ index->set_modified(mtr);
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ rtr_init_rtr_info(&rtr_info, false,
+ &ins_cur, index, false);
+
+ rtr_info_update_btr(&ins_cur, &rtr_info);
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_TREE, &mtr);
+
+ if (error == DB_SUCCESS) {
+ error = btr_cur_pessimistic_insert(
+ flag, &ins_cur, &ins_offsets,
+ &heap, dtuple, &rec,
+ &big_rec, 0, NULL, &mtr);
+ }
+ }
+
+ ut_ad(!big_rec);
+
+ DBUG_EXECUTE_IF(
+ "row_merge_ins_spatial_fail",
+ error = DB_FAIL;
+ );
+
+ if (error == DB_SUCCESS) {
+ if (rtr_info.mbr_adj) {
+ error = rtr_ins_enlarge_mbr(
+ &ins_cur, &mtr);
+ }
+
+ if (error == DB_SUCCESS) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&ins_cur),
+ btr_cur_get_page_zip(&ins_cur),
+ trx_id, &mtr);
+ }
+ }
+
+ mtr.commit();
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+
+ m_dtuple_vec.clear();
+
+ return(error);
+ }
+
+private:
+  /** Cache index rows made from a clustered index scan, usually
+  rows from a single clustered index page */
+ typedef std::vector<dtuple_t*, ut_allocator<dtuple_t*> > idx_tuple_vec;
+
+  /** vector used to cache index rows made from the clustered index scan */
+ idx_tuple_vec m_dtuple_vec;
+public:
+ /** the index being built */
+ dict_index_t*const index;
+};
+
+/* Maximum pending doc memory limit in bytes for a fts tokenization thread */
+#define FTS_PENDING_DOC_MEMORY_LIMIT 1000000
+
+/** Insert sorted data tuples to the index.
+@param[in] index index to be inserted
+@param[in] old_table old table
+@param[in] fd file descriptor
+@param[in,out] block file buffer
+@param[in]	row_buf		the sorted data tuples,
+or NULL if fd and block are to be used instead
+@param[in,out] btr_bulk btr bulk instance
+@param[in] table_total_rows total rows of old table
+@param[in]	pct_progress	total progress percent until now
+@param[in] pct_cost current progress percent
+@param[in] crypt_block buffer for encryption or NULL
+@param[in] space space id
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->begin_phase_insert() will be called initially
+and then stage->inc() will be called for each record that is processed.
+@param[in]	blob_file	file from which to read the BLOB
+				(big column) field data; applicable
+				only to the bulk insert operation
+@return DB_SUCCESS or error number */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+ dict_index_t* index,
+ const dict_table_t* old_table,
+ const pfs_os_file_t& fd,
+ row_merge_block_t* block,
+ const row_merge_buf_t* row_buf,
+ BtrBulk* btr_bulk,
+ const ib_uint64_t table_total_rows,
+ double pct_progress,
+ double pct_cost,
+ row_merge_block_t* crypt_block,
+ ulint space,
+ ut_stage_alter_t* stage= nullptr,
+ merge_file_t* blob_file= nullptr);
+
+/** Encode an index record.
+@return size of the record */
+static MY_ATTRIBUTE((nonnull))
+ulint
+row_merge_buf_encode(
+/*=================*/
+ byte** b, /*!< in/out: pointer to
+ current end of output buffer */
+ const dict_index_t* index, /*!< in: index */
+ const mtuple_t* entry, /*!< in: index fields
+ of the record to encode */
+ ulint n_fields) /*!< in: number of fields
+ in the entry */
+{
+ ulint size;
+ ulint extra_size;
+
+ size = rec_get_converted_size_temp<false>(
+ index, entry->fields, n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+
+ /* Encode extra_size + 1 */
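+	/* Values below 0x80 are stored in one byte; larger values are
+	stored in two bytes with the high bit of the first byte set.
+	The same encoding is decoded in row_merge_read_rec(). */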
+ if (extra_size + 1 < 0x80) {
+ *(*b)++ = (byte) (extra_size + 1);
+ } else {
+ ut_ad((extra_size + 1) < 0x8000);
+ *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+ *(*b)++ = (byte) (extra_size + 1);
+ }
+
+ rec_convert_dtuple_to_temp<false>(*b + extra_size, index,
+ entry->fields, n_fields);
+
+ *b += size;
+ return size;
+}
+
+static MY_ATTRIBUTE((malloc, nonnull))
+row_merge_buf_t*
+row_merge_buf_create_low(
+ row_merge_buf_t *buf, mem_heap_t *heap, dict_index_t *index)
+{
+ ulint max_tuples = srv_sort_buf_size
+ / std::max<ulint>(1, dict_index_get_min_size(index));
+ ut_ad(max_tuples > 0);
+ ut_ad(max_tuples <= srv_sort_buf_size);
+
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = static_cast<mtuple_t*>(
+ ut_malloc_nokey(2 * max_tuples * sizeof *buf->tuples));
+ buf->tmp_tuples = buf->tuples + max_tuples;
+ return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+{
+ row_merge_buf_t* buf;
+ ulint buf_size;
+ mem_heap_t* heap;
+
+ buf_size = (sizeof *buf);
+
+ heap = mem_heap_create(buf_size);
+
+ buf = static_cast<row_merge_buf_t*>(
+ mem_heap_zalloc(heap, buf_size));
+ row_merge_buf_create_low(buf, heap, index);
+
+ return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+{
+ ulint buf_size = sizeof *buf;
+ ulint max_tuples = buf->max_tuples;
+ mem_heap_t* heap = buf->heap;
+ dict_index_t* index = buf->index;
+ mtuple_t* tuples = buf->tuples;
+
+ mem_heap_empty(heap);
+
+ buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = tuples;
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+{
+ ut_free(buf->tuples);
+ mem_heap_free(buf->heap);
+}
+
+/** Convert the field data from compact to redundant format.
+@param[in] row_field field to copy from
+@param[out] field field to copy to
+@param[in] len length of the field data
+@param[in] zip_size compressed BLOB page size,
+ zero for uncompressed BLOBs
+@param[in,out] heap memory heap where to allocate data when
+ converting to ROW_FORMAT=REDUNDANT, or NULL
+ when not to invoke
+ row_merge_buf_redundant_convert(). */
+static
+void
+row_merge_buf_redundant_convert(
+ const dfield_t* row_field,
+ dfield_t* field,
+ ulint len,
+ ulint zip_size,
+ mem_heap_t* heap)
+{
+ ut_ad(field->type.mbminlen == 1);
+ ut_ad(field->type.mbmaxlen > 1);
+
+ byte* buf = (byte*) mem_heap_alloc(heap, len);
+ ulint field_len = row_field->len;
+ ut_ad(field_len <= len);
+
+ if (row_field->ext) {
+ const byte* field_data = static_cast<const byte*>(
+ dfield_get_data(row_field));
+ ulint ext_len;
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(memcmp(field_data + field_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ byte* data = btr_copy_externally_stored_field(
+ &ext_len, field_data, zip_size, field_len, heap);
+
+ ut_ad(ext_len < len);
+
+ memcpy(buf, data, ext_len);
+ field_len = ext_len;
+ } else {
+ memcpy(buf, row_field->data, field_len);
+ }
+
+ memset(buf + field_len, 0x20, len - field_len);
+
+ dfield_set_data(field, buf, len);
+}
+
+/** Insert the tuple into the bulk insert buffer
+@param buf merge buffer for the index operation
+@param table table into which the bulk insert is done
+@param row tuple to be inserted
+@return number of rows added, or 0 if the buffer is full */
+static ulint row_merge_bulk_buf_add(row_merge_buf_t* buf,
+ const dict_table_t &table,
+ const dtuple_t &row)
+{
+ if (buf->n_tuples >= buf->max_tuples)
+ return 0;
+
+ const dict_index_t *index= buf->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ mtuple_t *entry= &buf->tuples[buf->n_tuples];
+ ulint data_size= 0;
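+  /* Reserve one bit per nullable field for the NULL-flags bitmap of
+  the temporary-file record format. */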
+ ulint extra_size= UT_BITS_IN_BYTES(unsigned(index->n_nullable));
+ dfield_t *field= entry->fields= static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
+ const dict_field_t *ifield= dict_index_get_nth_field(index, 0);
+
+ for (ulint i = 0; i < n_fields; i++, field++, ifield++)
+ {
+ dfield_copy(field, &row.fields[i]);
+ ulint len= dfield_get_len(field);
+ const dict_col_t* const col= ifield->col;
+
+ if (dfield_is_null(field))
+ continue;
+
+ ulint fixed_len= ifield->fixed_len;
+
+ /* CHAR in ROW_FORMAT=REDUNDANT is always
+ fixed-length, but in the temporary file it is
+ variable-length for variable-length character sets. */
+ if (fixed_len && !index->table->not_redundant() &&
+ col->mbminlen != col->mbmaxlen)
+ fixed_len= 0;
+
+    if (fixed_len); /* no length byte is stored for a fixed-length field */
+ else if (len < 128 || (!DATA_BIG_COL(col)))
+ extra_size++;
+ else
+ extra_size += 2;
+ data_size += len;
+ }
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+ /* Reserve bytes for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= srv_sort_buf_size)
+ return 0;
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+
+ field= entry->fields;
+
+ do
+ dfield_dup(field++, buf->heap);
+ while (--n_fields);
+
+ return 1;
+}
+
+/** Insert a data tuple into a sort buffer.
+@param[in,out] buf sort buffer
+@param[in] fts_index fts index to be created
+@param[in] old_table original table
+@param[in] new_table new table
+@param[in,out] psort_info parallel sort info
+@param[in,out] row table row
+@param[in] ext cache of externally stored
+ column prefixes, or NULL
+@param[in] history_fts row is historical in a system-versioned table
+ on which a FTS_DOC_ID_INDEX(FTS_DOC_ID) exists
+@param[in,out] doc_id Doc ID if we are creating
+ FTS index
+@param[in,out] conv_heap memory heap where to allocate data when
+ converting to ROW_FORMAT=REDUNDANT, or NULL
+ when not to invoke
+ row_merge_buf_redundant_convert()
+@param[in,out] err set if error occurs
+@param[in,out] v_heap heap memory to process data for virtual column
+@param[in,out] my_table mysql table object
+@param[in] trx transaction object
+@param[in] col_collate columns whose collations changed, or nullptr
+@return number of rows added, 0 if out of space */
+static
+ulint
+row_merge_buf_add(
+ row_merge_buf_t* buf,
+ dict_index_t* fts_index,
+ const dict_table_t* old_table,
+ const dict_table_t* new_table,
+ fts_psort_t* psort_info,
+ dtuple_t* row,
+ const row_ext_t* ext,
+ const bool history_fts,
+ doc_id_t* doc_id,
+ mem_heap_t* conv_heap,
+ dberr_t* err,
+ mem_heap_t** v_heap,
+ TABLE* my_table,
+ trx_t* trx,
+ const col_collations* col_collate)
+{
+ ulint i;
+ const dict_index_t* index;
+ mtuple_t* entry;
+ dfield_t* field;
+ const dict_field_t* ifield;
+ ulint n_fields;
+ ulint data_size;
+ ulint extra_size;
+ ulint bucket = 0;
+ doc_id_t write_doc_id;
+ ulint n_row_added = 0;
+ VCOL_STORAGE vcol_storage;
+
+ DBUG_ENTER("row_merge_buf_add");
+
+ if (buf->n_tuples >= buf->max_tuples) {
+error:
+ n_row_added = 0;
+ goto end;
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_row_merge_buf_add_two",
+ if (buf->n_tuples >= 2) DBUG_RETURN(0););
+
+ UNIV_PREFETCH_R(row->fields);
+
+ /* If we are building FTS index, buf->index points to
+ the 'fts_sort_idx', and real FTS index is stored in
+ fts_index */
+ index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
+
+	/* Spatial index creation should not come through this path. */
+ ut_ad(!dict_index_is_spatial(index));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ entry = &buf->tuples[buf->n_tuples];
+ field = entry->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
+
+ data_size = 0;
+ extra_size = UT_BITS_IN_BYTES(unsigned(index->n_nullable));
+
+ ifield = dict_index_get_nth_field(index, 0);
+
+ for (i = 0; i < n_fields; i++, field++, ifield++) {
+ ulint len;
+ ulint fixed_len;
+ const dfield_t* row_field;
+ const dict_col_t* const col = ifield->col;
+ const dict_v_col_t* const v_col = col->is_virtual()
+ ? reinterpret_cast<const dict_v_col_t*>(col)
+ : NULL;
+
+ /* Process the Doc ID column */
+ if (!v_col && (history_fts || *doc_id)
+ && col->ind == index->table->fts->doc_col) {
+ fts_write_doc_id((byte*) &write_doc_id, *doc_id);
+
+ /* Note: field->data now points to a value on the
+ stack: &write_doc_id after dfield_set_data(). Because
+ there is only one doc_id per row, it shouldn't matter.
+ We allocate a new buffer before we leave the function
+ later below. */
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+
+ field->type.mtype = ifield->col->mtype;
+ field->type.prtype = ifield->col->prtype;
+ field->type.mbminlen = 0;
+ field->type.mbmaxlen = 0;
+ field->type.len = ifield->col->len;
+ } else {
+ /* Use callback to get the virtual column value */
+ if (v_col) {
+ dict_index_t* clust_index
+ = dict_table_get_first_index(new_table);
+
+ if (!vcol_storage.innobase_record &&
+ !innobase_allocate_row_for_vcol(
+ trx->mysql_thd, clust_index,
+ v_heap, &my_table,
+ &vcol_storage)) {
+ *err = DB_OUT_OF_MEMORY;
+ goto error;
+ }
+
+ row_field = innobase_get_computed_value(
+ row, v_col, clust_index,
+ v_heap, NULL, ifield, trx->mysql_thd,
+ my_table, vcol_storage.innobase_record,
+ old_table, NULL);
+
+ if (row_field == NULL) {
+ *err = DB_COMPUTE_VALUE_FAILED;
+ goto error;
+ }
+ dfield_copy(field, row_field);
+ } else {
+ row_field = dtuple_get_nth_field(row,
+ col->ind);
+ dfield_copy(field, row_field);
+
+ /* Copy the column collation to the
+ tuple field */
+ if (col_collate) {
+ auto it = col_collate->find(col->ind);
+ if (it != col_collate->end()) {
+ field->type
+ .assign(*it->second);
+ }
+ }
+ }
+
+ /* Tokenize and process data for FTS */
+ if (!history_fts && (index->type & DICT_FTS)) {
+ fts_doc_item_t* doc_item;
+ byte* value;
+ void* ptr;
+ const ulint max_trial_count = 10000;
+ ulint trial_count = 0;
+
+				/* Fetch the Doc ID if it already
+				exists in the row and was not supplied
+				by the caller. Even if the value
+				column is NULL, we still need the Doc
+				ID so as to maintain the correct
+				maximum Doc ID */
+ if (*doc_id == 0) {
+ const dfield_t* doc_field;
+ doc_field = dtuple_get_nth_field(
+ row,
+ index->table->fts->doc_col);
+ *doc_id = (doc_id_t) mach_read_from_8(
+ static_cast<const byte*>(
+ dfield_get_data(doc_field)));
+
+ if (*doc_id == 0) {
+ ib::warn() << "FTS Doc ID is"
+ " zero. Record"
+ " skipped";
+ goto error;
+ }
+ }
+
+ if (dfield_is_null(field)) {
+ n_row_added = 1;
+ continue;
+ }
+
+ ptr = ut_malloc_nokey(sizeof(*doc_item)
+ + field->len);
+
+ doc_item = static_cast<fts_doc_item_t*>(ptr);
+ value = static_cast<byte*>(ptr)
+ + sizeof(*doc_item);
+ memcpy(value, field->data, field->len);
+ field->data = value;
+
+ doc_item->field = field;
+ doc_item->doc_id = *doc_id;
+
+ bucket = static_cast<ulint>(
+ *doc_id % fts_sort_pll_degree);
+
+ /* Add doc item to fts_doc_list */
+ mysql_mutex_lock(&psort_info[bucket].mutex);
+
+ if (psort_info[bucket].error == DB_SUCCESS) {
+ UT_LIST_ADD_LAST(
+ psort_info[bucket].fts_doc_list,
+ doc_item);
+ psort_info[bucket].memory_used +=
+ sizeof(*doc_item) + field->len;
+ } else {
+ ut_free(doc_item);
+ }
+
+ mysql_mutex_unlock(&psort_info[bucket].mutex);
+
+				/* Sleep while the memory used exceeds the limit */
+ while (psort_info[bucket].memory_used
+ > FTS_PENDING_DOC_MEMORY_LIMIT
+ && trial_count++ < max_trial_count) {
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(1));
+ }
+
+ n_row_added = 1;
+ continue;
+ }
+
+ /* innobase_get_computed_value() sets the
+ length of the virtual column field. */
+ if (v_col == NULL
+ && field->len != UNIV_SQL_NULL
+ && col->mtype == DATA_MYSQL
+ && col->len != field->len) {
+ if (conv_heap != NULL) {
+ row_merge_buf_redundant_convert(
+ row_field, field, col->len,
+ old_table->space->zip_size(),
+ conv_heap);
+ }
+ }
+ }
+
+ len = dfield_get_len(field);
+
+ if (dfield_is_null(field)) {
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ } else if (!ext) {
+ } else if (dict_index_is_clust(index)) {
+ /* Flag externally stored fields. */
+ const byte* buf = row_ext_lookup(ext, col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ if (i < dict_index_get_n_unique(index)) {
+ dfield_set_data(field, buf, len);
+ } else {
+ dfield_set_ext(field);
+ len = dfield_get_len(field);
+ }
+ }
+ } else if (!v_col) {
+			/* Only non-virtual columns are stored externally */
+ const byte* buf = row_ext_lookup(ext, col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ dfield_set_data(field, buf, len);
+ }
+ }
+
+ /* If a column prefix index, take only the prefix */
+
+ if (ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len,
+ len,
+ static_cast<char*>(dfield_get_data(field)));
+ dfield_set_len(field, len);
+ }
+
+ ut_ad(len <= col->len
+ || DATA_LARGE_MTYPE(col->mtype));
+
+ fixed_len = ifield->fixed_len;
+ if (fixed_len && !dict_table_is_comp(index->table)
+ && col->mbminlen != col->mbmaxlen) {
+ /* CHAR in ROW_FORMAT=REDUNDANT is always
+ fixed-length, but in the temporary file it is
+ variable-length for variable-length character
+ sets. */
+ fixed_len = 0;
+ }
+
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+			/* len should be between the sizes calculated
+			based on mbmaxlen and mbminlen */
+ ut_ad(len <= fixed_len);
+ ut_ad(!col->mbmaxlen || len >= col->mbminlen
+ * (fixed_len / col->mbmaxlen));
+
+ ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(field)) {
+ extra_size += 2;
+ } else if (len < 128
+ || (!DATA_BIG_COL(col))) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+ /* If this is FTS index, we already populated the sort buffer, return
+ here */
+ if (index->type & DICT_FTS) {
+ goto end;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint size;
+ ulint extra;
+
+ size = rec_get_converted_size_temp<false>(
+ index, entry->fields, n_fields, &extra);
+
+ ut_ad(data_size + extra_size == size);
+ ut_ad(extra_size == extra);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+	/* The record size can exceed the page size while converting to
+	the redundant row format. But there is an assertion
+	ut_ad(size < srv_page_size) in rec_offs_data_size(), which
+	could be hit before we even attempt to insert the row. */
+ if (conv_heap != NULL && data_size > srv_page_size) {
+ *err = DB_TOO_BIG_RECORD;
+ }
+
+ ut_ad(data_size < srv_sort_buf_size);
+
+ /* Reserve bytes for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= srv_sort_buf_size) {
+ goto error;
+ }
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+ n_row_added++;
+
+ field = entry->fields;
+
+ /* Copy the data fields. */
+
+ do {
+ dfield_dup(field++, buf->heap);
+ } while (--n_fields);
+
+ if (conv_heap != NULL) {
+ mem_heap_empty(conv_heap);
+ }
+
+end:
+ if (vcol_storage.innobase_record)
+ innobase_free_row_for_vcol(&vcol_storage);
+ DBUG_RETURN(n_row_added);
+}
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+{
+ if (!dup->n_dup++ && dup->table) {
+ /* Only report the first duplicate record,
+ but count all duplicate records. */
+ innobase_fields_to_mysql(dup->table, dup->index, entry);
+ }
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return positive, 0, or negative if a is greater than, equal to, or
+less than b, respectively */
+static MY_ATTRIBUTE((warn_unused_result))
+int
+row_merge_tuple_cmp(
+/*================*/
+	const dict_index_t*	index,	/*!< in: index tree */
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ const mtuple_t& a, /*!< in: first tuple to be compared */
+ const mtuple_t& b, /*!< in: second tuple to be compared */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates,
+ NULL if non-unique index */
+{
+ int cmp;
+ const dfield_t* af = a.fields;
+ const dfield_t* bf = b.fields;
+ ulint n = n_uniq;
+ const dict_field_t* f = index->fields;
+
+ ut_ad(n_uniq > 0);
+ ut_ad(n_uniq <= n_field);
+
+ /* Compare the fields of the tuples until a difference is
+ found or we run out of fields to compare. If !cmp at the
+ end, the tuples are equal. */
+ do {
+ cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending);
+ } while (!cmp && --n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ if (dup) {
+ /* Report a duplicate value error if the tuples are
+		logically equal. NULL columns are logically unequal,
+ although they are equal in the sorting order. Find
+ out if any of the fields are NULL. */
+ for (const dfield_t* df = a.fields; df != af; df++) {
+ if (dfield_is_null(df)) {
+ goto no_report;
+ }
+ }
+
+ row_merge_dup_report(dup, a.fields);
+ }
+
+no_report:
+ /* The n_uniq fields were equal, but we compare all fields so
+ that we will get the same (internal) order as in the B-tree. */
+ for (n = n_field - n_uniq + 1; --n; ) {
+ cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending);
+ if (cmp) {
+ return(cmp);
+ }
+ }
+
+	/* This should never be reached, except when creating a
+	secondary index together with a new PRIMARY KEY and there is
+	a duplicate in the PRIMARY KEY that has not been detected
+	yet. Internally, an index must never contain duplicates. */
+ return(cmp);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param tuples array of tuples that are being sorted
+@param aux work area, same size as tuples[]
+@param low lower bound of the sorting area, inclusive
+@param high upper bound of the sorting area, exclusive
+#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \
+ row_merge_tuple_sort(index,n_uniq,n_field,dup, tuples, aux, low, high)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a first tuple to be compared
+@param b second tuple to be compared
+@return positive, 0, or negative if a is greater than, equal to, or
+less than b, respectively */
+#define row_merge_tuple_cmp_ctx(a,b) \
+ row_merge_tuple_cmp(index, n_uniq, n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory. */
+static
+void
+row_merge_tuple_sort(
+/*=================*/
+ const dict_index_t* index, /*!< in: index tree */
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ row_merge_dup_t* dup, /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ mtuple_t* tuples, /*!< in/out: tuples */
+ mtuple_t* aux, /*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the
+ sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the
+ sorting area, exclusive */
+{
+ ut_ad(n_field > 0);
+ ut_ad(n_uniq <= n_field);
+
+ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+ tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+{
+ ut_ad(!buf->index->is_spatial());
+ row_merge_tuple_sort(buf->index, buf->index->n_uniq, buf->index->n_fields,
+ dup, buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
+}
+
+/** Write the BLOB field data to the temporary file and store the
+offset and length in the field data
+@param field tuple field
+@param blob_file file to store the blob data
+@param heap heap to store the blob offset and length
+@return DB_SUCCESS if successful */
+static dberr_t row_merge_write_blob_to_tmp_file(
+  dfield_t *field, merge_file_t *blob_file, mem_heap_t **heap)
+{
+ if (blob_file->fd == OS_FILE_CLOSED)
+ {
+ blob_file->fd= row_merge_file_create_low(nullptr);
+ if (blob_file->fd == OS_FILE_CLOSED)
+ return DB_OUT_OF_MEMORY;
+ }
+ uint64_t val= blob_file->offset;
+ uint32_t len= field->len;
+ dberr_t err= os_file_write(
+ IORequestWrite, "(bulk insert)", blob_file->fd,
+ field->data, blob_file->offset, len);
+
+ if (err != DB_SUCCESS)
+ return err;
+
+ byte *data= static_cast<byte*>
+ (mem_heap_alloc(*heap, BTR_EXTERN_FIELD_REF_SIZE));
+
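+  /* Fill in a BTR_EXTERN_FIELD_REF_SIZE (20-byte) reference that
+  points into the temporary blob file instead of a tablespace page. */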
+ /* Write zeroes for first 8 bytes */
+ memset(data, 0, 8);
+ /* Write offset for next 8 bytes */
+ mach_write_to_8(data + 8, val);
+ /* Write length of the blob in 4 bytes */
+ mach_write_to_4(data + 16, len);
+ blob_file->offset+= field->len;
+ blob_file->n_rec++;
+ dfield_set_data(field, data, BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(field);
+ return err;
+}
+
+/** This function is invoked when the tuple size is greater than
+innodb_sort_buffer_size. It recreates the tuple by writing the
+BLOB fields to the temporary file.
+@param entry index entry whose BLOB fields are to be written
+@param blob_file file to store the blob data
+@param heap heap to store the blob offset and blob length
+@return tuple which fits into sort_buffer_size */
+static dtuple_t* row_merge_buf_large_tuple(const dtuple_t &entry,
+ merge_file_t *blob_file,
+ mem_heap_t **heap)
+{
+ if (!*heap)
+ *heap= mem_heap_create(DTUPLE_EST_ALLOC(entry.n_fields));
+
+ dtuple_t *tuple= dtuple_copy(&entry, *heap);
+ for (ulint i= 0; i < tuple->n_fields; i++)
+ {
+ dfield_t *field= &tuple->fields[i];
+ if (dfield_is_null(field) || field->len <= 2000)
+ continue;
+
+ dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap);
+ if (err != DB_SUCCESS)
+ return nullptr;
+ }
+
+ return tuple;
+}
+
+
+/** Write field data whose length exceeds 2000 bytes into the
+temporary blob file and store the offset and length in the
+tuple field
+@param entry index entry whose BLOB fields are to be written
+@param n_fields number of fields in the entry
+@param heap heap to store the blob offset and blob length
+@param blob_file file to store the blob data */
+static dberr_t row_merge_buf_blob(const mtuple_t *entry, ulint n_fields,
+ mem_heap_t **heap, merge_file_t *blob_file)
+{
+
+ if (!*heap)
+ *heap= mem_heap_create(100);
+
+ for (ulint i= 0; i < n_fields; i++)
+ {
+ dfield_t *field= &entry->fields[i];
+ if (dfield_is_null(field) || field->len <= 2000)
+ continue;
+
+ dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap);
+ if (err != DB_SUCCESS)
+ return err;
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Write a buffer to a block.
+@param buf sorted buffer
+@param block buffer for writing to file
+@param blob_file blob file handle used by the bulk insert operation */
+dberr_t row_merge_buf_write(const row_merge_buf_t *buf,
+#ifndef DBUG_OFF
+ const merge_file_t *of, /*!< output file */
+#endif
+ row_merge_block_t *block,
+ merge_file_t *blob_file)
+{
+ const dict_index_t* index = buf->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ byte* b = &block[0];
+ mem_heap_t* blob_heap = nullptr;
+ dberr_t err = DB_SUCCESS;
+
+ DBUG_ENTER("row_merge_buf_write");
+
+ for (ulint i = 0; i < buf->n_tuples; i++) {
+ const mtuple_t* entry = &buf->tuples[i];
+
+ if (blob_file) {
+ ut_ad(buf->index->is_primary());
+ err = row_merge_buf_blob(
+ entry, n_fields, &blob_heap, blob_file);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+ ulint rec_size= row_merge_buf_encode(
+ &b, index, entry, n_fields);
+ if (blob_file && rec_size > srv_page_size) {
+ err = DB_TOO_BIG_RECORD;
+ goto func_exit;
+ }
+
+ ut_ad(b < &block[srv_sort_buf_size]);
+
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ','
+ << of->fd << ',' << of->offset << ' ' <<
+ i << ": " <<
+ rec_printer(entry->fields, n_fields).str());
+ }
+
+ /* Write an "end-of-chunk" marker. */
+ ut_a(b < &block[srv_sort_buf_size]);
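+	/* When a blob_file is used, long fields may have been replaced
+	by 20-byte references in row_merge_buf_blob(), so the encoded
+	size can differ from buf->total_size. */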
+ ut_a(b == &block[0] + buf->total_size || blob_file);
+ *b++ = 0;
+#ifdef HAVE_valgrind
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* HAVE_valgrind */
+ DBUG_LOG("ib_merge_sort",
+ "write " << reinterpret_cast<const void*>(b) << ','
+ << of->fd << ',' << of->offset << " EOF");
+func_exit:
+ if (blob_heap) {
+ mem_heap_free(blob_heap);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/******************************************************//**
+Create a memory heap and allocate space for the two record offset
+arrays and mrec_buf_t[3].
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
+ rec_offs** offsets1, /*!< out: offsets */
+ rec_offs** offsets2) /*!< out: offsets */
+{
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
+
+ *buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, 3 * sizeof **buf));
+ *offsets1 = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof **offsets1));
+ *offsets2 = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof **offsets2));
+
+ rec_offs_set_n_alloc(*offsets1, i);
+ rec_offs_set_n_alloc(*offsets2, i);
+ rec_offs_set_n_fields(*offsets1, dict_index_get_n_fields(index));
+ rec_offs_set_n_fields(*offsets2, dict_index_get_n_fields(index));
+
+ return(heap);
+}
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf, /*!< out: data */
+ row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size;
+
+ DBUG_ENTER("row_merge_read");
+ DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
+ DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE););
+
+ const dberr_t err = os_file_read(
+ IORequestRead, fd, buf, ofs, srv_sort_buf_size, nullptr);
+
+ /* If encryption is enabled decrypt buffer */
+ if (err == DB_SUCCESS && srv_encrypt_log) {
+ if (!log_tmp_block_decrypt(buf, srv_sort_buf_size,
+ crypt_buf, ofs)) {
+ DBUG_RETURN(false);
+ }
+
+ srv_stats.n_merge_blocks_decrypted.inc();
+ memcpy(buf, crypt_buf, srv_sort_buf_size);
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ DBUG_RETURN(err == DB_SUCCESS);
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+bool
+row_merge_write(
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf, /*!< in: data */
+ void* crypt_buf, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ size_t buf_len = srv_sort_buf_size;
+ os_offset_t ofs = buf_len * (os_offset_t) offset;
+ void* out_buf = (void *)buf;
+
+ DBUG_ENTER("row_merge_write");
+ DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
+ DBUG_EXECUTE_IF("row_merge_write_failure", DBUG_RETURN(FALSE););
+
+ /* For encrypted tables, encrypt data before writing */
+ if (srv_encrypt_log) {
+ if (!log_tmp_block_encrypt(static_cast<const byte*>(buf),
+ buf_len,
+ static_cast<byte*>(crypt_buf),
+ ofs)) {
+ DBUG_RETURN(false);
+ }
+
+ srv_stats.n_merge_blocks_encrypted.inc();
+ out_buf = crypt_buf;
+ }
+
+ const bool success = DB_SUCCESS == os_file_write(
+ IORequestWrite, "(merge)", fd, out_buf, ofs, buf_len);
+
+#ifdef POSIX_FADV_DONTNEED
+ /* The block will be needed on the next merge pass,
+ but it can be evicted from the file cache meanwhile. */
+ posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ DBUG_RETURN(success);
+}
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ rec_offs* offsets,/*!< out: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint avail_size;
+
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+
+ ut_ad(rec_offs_get_n_alloc(offsets) == 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index));
+
+ DBUG_ENTER("row_merge_read_rec");
+
+ extra_size = *b++;
+
+ if (UNIV_UNLIKELY(!extra_size)) {
+ /* End of list */
+ *mrec = NULL;
+ DBUG_LOG("ib_merge_sort",
+ "read " << reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block) << ',' <<
+ fd << ',' << *foffs << " EOF");
+ DBUG_RETURN(NULL);
+ }
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+err_exit:
+ /* Signal I/O error. */
+ *mrec = b;
+ DBUG_RETURN(NULL);
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+ }
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *b++;
+ }
+
+ /* Normalize extra_size. Above, value 0 signals "end of list". */
+ extra_size--;
+
+ /* Read the extra bytes. */
+
+ if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks. Copy the entire record
+ to the auxiliary buffer and handle this as a special
+ case. */
+
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+ ut_ad(avail_size < sizeof *buf);
+ memcpy(*buf, b, avail_size);
+
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the record. */
+ memcpy(*buf + avail_size, b, extra_size - avail_size);
+ b += extra_size - avail_size;
+
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+
+ /* These overflows should be impossible given that
+ records are much smaller than either buffer, and
+ the record starts near the beginning of each buffer. */
+ ut_a(extra_size + data_size < sizeof *buf);
+ ut_a(b + data_size < &block[srv_sort_buf_size]);
+
+ /* Copy the data bytes. */
+ memcpy(*buf + extra_size, b, data_size);
+ b += data_size;
+
+ goto func_exit;
+ }
+
+ *mrec = b + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+ ut_ad(extra_size + data_size < sizeof *buf);
+
+ b += extra_size + data_size;
+
+ if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
+ /* The record fits entirely in the block.
+ This is the normal case. */
+ goto func_exit;
+ }
+
+ /* The record spans two blocks. Copy it to buf. */
+
+ b -= extra_size + data_size;
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+ memcpy(*buf, b, avail_size);
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ if (!row_merge_read(fd, ++(*foffs), block,
+ crypt_block,
+ space)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the rest of the record. */
+ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+ b += extra_size + data_size - avail_size;
+
+func_exit:
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block)
+ << ",fd=" << fd << ',' << *foffs << ": "
+ << rec_printer(*mrec, 0, offsets).str());
+ DBUG_RETURN(b);
+}
+
+/********************************************************************//**
+Write a merge record. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+ byte* b, /*!< out: buffer */
+ ulint e, /*!< in: encoded extra_size */
+#ifndef DBUG_OFF
+ ulint size, /*!< in: total size to write */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint foffs, /*!< in: file offset */
+#endif /* !DBUG_OFF */
+ const mrec_t* mrec, /*!< in: record to write */
+ const rec_offs* offsets)/*!< in: offsets of mrec */
+#ifdef DBUG_OFF
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
+ row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* DBUG_OFF */
+{
+ DBUG_ENTER("row_merge_write_rec_low");
+
+#ifndef DBUG_OFF
+ const byte* const end = b + size;
+#endif /* DBUG_OFF */
+ DBUG_ASSERT(e == rec_offs_extra_size(offsets) + 1);
+
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ",fd=" << fd << ','
+ << foffs << ": " << rec_printer(mrec, 0, offsets).str());
+
+ if (e < 0x80) {
+ *b++ = (byte) e;
+ } else {
+ *b++ = (byte) (0x80 | (e >> 8));
+ *b++ = (byte) e;
+ }
+
+ memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+ DBUG_SLOW_ASSERT(b + rec_offs_size(offsets) == end);
+ DBUG_VOID_RETURN;
+}
+
+/********************************************************************//**
+Write a merge record.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ byte* b, /*!< in: pointer to end of block */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t* mrec, /*!< in: record to write */
+ const rec_offs* offsets,/*!< in: offsets of mrec */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ulint extra_size;
+ ulint size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(mrec);
+ ut_ad(foffs);
+ ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
+ ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+ /* Normalize extra_size. Value 0 signals "end of list". */
+ extra_size = rec_offs_extra_size(offsets) + 1;
+
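+	/* The encoded extra_size occupies one byte, or two bytes when
+	extra_size >= 0x80, followed by the record data. */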
+ size = extra_size + (extra_size >= 0x80)
+ + rec_offs_data_size(offsets);
+
+ if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks.
+ Copy it to the temporary buffer first. */
+ avail_size = ulint(&block[srv_sort_buf_size] - b);
+
+ row_merge_write_rec_low(buf[0],
+ extra_size, size, fd, *foffs,
+ mrec, offsets);
+
+ /* Copy the head of the temporary buffer, write
+ the completed block, and copy the tail of the
+ record to the head of the new block. */
+ memcpy(b, buf[0], avail_size);
+
+ if (!row_merge_write(fd, (*foffs)++, block,
+ crypt_block,
+ space)) {
+ return(NULL);
+ }
+
+ MEM_UNDEFINED(&block[0], srv_sort_buf_size);
+
+ /* Copy the rest. */
+ b = &block[0];
+ memcpy(b, buf[0] + avail_size, size - avail_size);
+ b += size - avail_size;
+ } else {
+ row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+ mrec, offsets);
+ b += size;
+ }
+
+ return(b);
+}
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ byte* b, /*!< in: pointer to end of block */
+ const pfs_os_file_t& fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space) /*!< in: space id */
+{
+ ut_ad(block);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(foffs);
+
+ DBUG_ENTER("row_merge_write_eof");
+ DBUG_LOG("ib_merge_sort",
+ reinterpret_cast<const void*>(b) << ',' <<
+ reinterpret_cast<const void*>(block) <<
+ ",fd=" << fd << ',' << *foffs);
+
+ *b++ = 0;
+ MEM_CHECK_DEFINED(&block[0], b - &block[0]);
+ MEM_CHECK_ADDRESSABLE(&block[0], srv_sort_buf_size);
+
+ /* The rest of the block is uninitialized. Silence warnings. */
+ MEM_MAKE_DEFINED(b, &block[srv_sort_buf_size] - b);
+
+ if (!row_merge_write(fd, (*foffs)++, block, crypt_block, space)) {
+ DBUG_RETURN(NULL);
+ }
+
+ MEM_UNDEFINED(&block[0], srv_sort_buf_size);
+ DBUG_RETURN(&block[0]);
+}
+
+/** Create a temporary file if it has not been created already.
+@param[in,out] tmpfd temporary file handle
+@param[in] path location for creating temporary file
+@return true on success, false on error */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_merge_tmpfile_if_needed(
+ pfs_os_file_t* tmpfd,
+ const char* path)
+{
+ if (*tmpfd == OS_FILE_CLOSED) {
+ *tmpfd = row_merge_file_create_low(path);
+ if (*tmpfd != OS_FILE_CLOSED) {
+ MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES);
+ }
+ }
+
+ return(*tmpfd != OS_FILE_CLOSED);
+}
+
+/** Create a temporary file for merge sort if it was not created already.
+@param[in,out]	file		merge file structure
+@param[in,out]	tmpfd		temporary file handle
+@param[in]	nrec		number of records in the file
+@param[in]	path		location for creating temporary file
+@return true on success, false on error */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+row_merge_file_create_if_needed(
+ merge_file_t* file,
+ pfs_os_file_t* tmpfd,
+ ulint nrec,
+ const char* path)
+{
+ ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED);
+ if (file->fd == OS_FILE_CLOSED && row_merge_file_create(file, path)!= OS_FILE_CLOSED) {
+ MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES);
+ if (!row_merge_tmpfile_if_needed(tmpfd, path) ) {
+ return(false);
+ }
+
+ file->n_rec = nrec;
+ }
+
+ ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED);
+ return(file->fd != OS_FILE_CLOSED);
+}
+
+/** Copy the merge data tuple from another merge data tuple.
+@param[in] mtuple source merge data tuple
+@param[in,out] prev_mtuple destination merge data tuple
+@param[in]	n_unique	number of unique fields in the mtuple
+@param[in,out]	heap		memory heap where prev_mtuple is allocated */
+static
+void
+row_mtuple_create(
+ const mtuple_t* mtuple,
+ mtuple_t* prev_mtuple,
+ ulint n_unique,
+ mem_heap_t* heap)
+{
+ memcpy(prev_mtuple->fields, mtuple->fields,
+ n_unique * sizeof *mtuple->fields);
+
+ dfield_t* field = prev_mtuple->fields;
+
+ for (ulint i = 0; i < n_unique; i++) {
+ dfield_dup(field++, heap);
+ }
+}
+
+/** Compare two merge data tuples.
+@param[in] prev_mtuple merge data tuple
+@param[in] current_mtuple merge data tuple
+@param[in,out] dup reporter of duplicates
+@return positive, 0, or negative if current_mtuple is greater than,
+equal to, or less than prev_mtuple */
+static
+int
+row_mtuple_cmp(
+ const mtuple_t* prev_mtuple,
+ const mtuple_t* current_mtuple,
+ row_merge_dup_t* dup)
+{
+ ut_ad(dup->index->is_primary());
+ const ulint n_uniq= dup->index->n_uniq;
+ return row_merge_tuple_cmp(dup->index, n_uniq, n_uniq,
+ *current_mtuple, *prev_mtuple, dup);
+}
+
+/** Insert cached spatial index rows.
+@param[in] trx_id transaction id
+@param[in] sp_tuples cached spatial rows
+@param[in] num_spatial number of spatial indexes
+@param[in,out] heap temporary memory heap
+@param[in,out] pcur cluster index cursor
+@param[in,out] started whether mtr is active
+@param[in,out] mtr mini-transaction
+@return DB_SUCCESS or error number */
+static
+dberr_t
+row_merge_spatial_rows(
+ trx_id_t trx_id,
+ spatial_index_info** sp_tuples,
+ ulint num_spatial,
+ mem_heap_t* heap,
+ btr_pcur_t* pcur,
+ bool& started,
+ mtr_t* mtr)
+{
+ if (!sp_tuples)
+ return DB_SUCCESS;
+
+ for (ulint j= 0; j < num_spatial; j++)
+ if (dberr_t err= sp_tuples[j]->insert(trx_id, pcur, started, heap, mtr))
+ return err;
+
+ mem_heap_empty(heap);
+ return DB_SUCCESS;
+}
+
+/** Check if the geometry field is valid.
+@param[in] row the row
+@param[in] index spatial index
+@return true if it's valid, false if it's invalid. */
+static
+bool
+row_geo_field_is_valid(
+ const dtuple_t* row,
+ dict_index_t* index)
+{
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, 0);
+ const dict_col_t* col
+ = ind_field->col;
+ ulint col_no
+ = dict_col_get_no(col);
+ const dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ if (dfield_is_null(dfield)
+ || dfield_get_len(dfield) < GEO_DATA_HEADER_SIZE) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Read the clustered index of the table and create temporary files
+containing the index entries for the indexes to be built.
+@param[in] trx transaction
+@param[in,out] table MySQL table object, for reporting erroneous
+ records
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+ old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] index indexes to be created
+@param[in] fts_sort_idx full-text index to be created, or NULL
+@param[in] psort_info parallel sort info for fts_sort_idx creation,
+ or NULL
+@param[in] files temporary files
+@param[in] key_numbers MySQL key numbers to create
+@param[in] n_index number of indexes to create
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v newly added virtual columns along with indexes
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in,out] block file buffer
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] tmpfd temporary file handle
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->n_pk_recs_inc() will be called for each record read and
+stage->inc() will be called for each page read.
+@param[in] pct_cost percent of task weight out of total alter job
+@param[in,out] crypt_block crypted file buffer
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_not_null allow null to not-null conversion
+@param[in] col_collate columns whose collations changed, or nullptr
+@return DB_SUCCESS or error */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_read_clustered_index(
+ trx_t* trx,
+ struct TABLE* table,
+ const dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** index,
+ dict_index_t* fts_sort_idx,
+ fts_psort_t* psort_info,
+ merge_file_t* files,
+ const ulint* key_numbers,
+ ulint n_index,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ row_merge_block_t* block,
+ bool skip_pk_sort,
+ pfs_os_file_t* tmpfd,
+ ut_stage_alter_t* stage,
+ double pct_cost,
+ row_merge_block_t* crypt_block,
+ struct TABLE* eval_table,
+ bool allow_not_null,
+ const col_collations* col_collate)
+{
+ dict_index_t* clust_index; /* Clustered index */
+ mem_heap_t* row_heap = NULL;/* Heap memory to create
+ clustered index tuples */
+ row_merge_buf_t** merge_buf; /* Temporary list for records*/
+ mem_heap_t* v_heap = NULL; /* Heap memory to process large
+ data for virtual column */
+ btr_pcur_t pcur; /* Cursor on the clustered
+ index */
+ mtr_t mtr; /* Mini transaction */
+ bool mtr_started = false;
+ dberr_t err = DB_SUCCESS;/* Return code */
+ ulint n_nonnull = 0; /* number of columns
+ changed to NOT NULL */
+ ulint* nonnull = NULL; /* NOT NULL columns */
+ dict_index_t* fts_index = NULL;/* FTS index */
+ doc_id_t doc_id = 0;
+ doc_id_t max_doc_id = 0;
+ ibool add_doc_id = FALSE;
+ pthread_cond_t* fts_parallel_sort_cond = nullptr;
+ spatial_index_info** sp_tuples = nullptr;
+ ulint num_spatial = 0;
+ BtrBulk* clust_btr_bulk = NULL;
+ bool clust_temp_file = false;
+ mem_heap_t* mtuple_heap = NULL;
+ mtuple_t prev_mtuple;
+ mem_heap_t* conv_heap = NULL;
+ double curr_progress = 0.0;
+ ib_uint64_t read_rows = 0;
+ ib_uint64_t table_total_rows = 0;
+ char new_sys_trx_start[8];
+ char new_sys_trx_end[8];
+ byte any_autoinc_data[8] = {0};
+ bool vers_update_trt = false;
+
+ DBUG_ENTER("row_merge_read_clustered_index");
+
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!defaults || col_map);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->id);
+
+ table_total_rows = dict_table_get_n_rows(old_table);
+	if (table_total_rows == 0) {
+ /* We don't know total row count */
+ table_total_rows = 1;
+ }
+
+ trx->op_info = "reading clustered index";
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
+#endif
+
+ /* Create and initialize memory for record buffers */
+
+ merge_buf = static_cast<row_merge_buf_t**>(
+ ut_malloc_nokey(n_index * sizeof *merge_buf));
+
+ row_merge_dup_t clust_dup = {index[0], table, col_map, 0};
+ dfield_t* prev_fields = nullptr;
+ const ulint n_uniq = dict_index_get_n_unique(index[0]);
+
+ ut_ad(trx->mysql_thd != NULL);
+
+ const char* path = thd_innodb_tmpdir(trx->mysql_thd);
+
+ ut_ad(!skip_pk_sort || dict_index_is_clust(index[0]));
+ /* There is no previous tuple yet. */
+ prev_mtuple.fields = NULL;
+
+ for (ulint i = 0; i < n_index; i++) {
+ if (index[i]->type & DICT_FTS) {
+
+ /* We are building a FT index, make sure
+ we have the temporary 'fts_sort_idx' */
+ ut_a(fts_sort_idx);
+
+ fts_index = index[i];
+
+ merge_buf[i] = row_merge_buf_create(fts_sort_idx);
+
+ add_doc_id = DICT_TF2_FLAG_IS_SET(
+ new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ /* If Doc ID does not exist in the table itself,
+ fetch the first FTS Doc ID */
+ if (add_doc_id) {
+ fts_get_next_doc_id(
+ (dict_table_t*) new_table,
+ &doc_id);
+ ut_ad(doc_id > 0);
+ }
+
+ row_fts_start_psort(psort_info);
+ fts_parallel_sort_cond =
+ &psort_info[0].psort_common->sort_cond;
+ } else {
+ if (dict_index_is_spatial(index[i])) {
+ num_spatial++;
+ }
+
+ merge_buf[i] = row_merge_buf_create(index[i]);
+ }
+ }
+
+ if (num_spatial > 0) {
+ ulint count = 0;
+
+ sp_tuples = static_cast<spatial_index_info**>(
+ ut_malloc_nokey(num_spatial
+ * sizeof(*sp_tuples)));
+
+ for (ulint i = 0; i < n_index; i++) {
+ if (dict_index_is_spatial(index[i])) {
+ sp_tuples[count]
+ = UT_NEW_NOKEY(
+ spatial_index_info(index[i]));
+ count++;
+ }
+ }
+
+ ut_ad(count == num_spatial);
+ }
+
+ mtr.start();
+ mtr_started = true;
+
+ /* Find the clustered index and create a persistent cursor
+ based on that. */
+
+ clust_index = dict_table_get_first_index(old_table);
+ const ulint old_trx_id_col = ulint(old_table->n_cols)
+ - (DATA_N_SYS_COLS - DATA_TRX_ID);
+ ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS);
+ ut_ad(old_table->cols[old_trx_id_col].prtype
+ == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(old_table->cols[old_trx_id_col + 1].mtype == DATA_SYS);
+ ut_ad(old_table->cols[old_trx_id_col + 1].prtype
+ == (DATA_ROLL_PTR | DATA_NOT_NULL));
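+	/* Position of DB_TRX_ID in the new table: mapped through
+	col_map if the table is being rebuilt, otherwise the same
+	position as in the old table. */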
+ const ulint new_trx_id_col = col_map
+ ? col_map[old_trx_id_col] : old_trx_id_col;
+ uint64_t n_rows = 0;
+
+ err = pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr);
+ if (err != DB_SUCCESS) {
+err_exit:
+ trx->error_key_num = 0;
+ goto func_exit;
+ } else {
+ rec_t* rec = page_rec_get_next(btr_pcur_get_rec(&pcur));
+ if (!rec) {
+corrupted_metadata:
+ err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ if (rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG) {
+ if (!clust_index->is_instant()) {
+ goto corrupted_metadata;
+ }
+ if (page_rec_is_comp(rec)
+ && rec_get_status(rec) != REC_STATUS_INSTANT) {
+ goto corrupted_metadata;
+ }
+ /* Skip the metadata pseudo-record. */
+ btr_pcur_get_page_cur(&pcur)->rec = rec;
+ } else if (clust_index->is_instant()) {
+ goto corrupted_metadata;
+ }
+ }
+
+ /* Check if the table is supposed to be empty for our read view.
+
+ If we read bulk_trx_id as an older transaction ID, it is not
+ incorrect to check here whether that transaction should be
+ visible to us. If bulk_trx_id is not visible to us, the table
+ must have been empty at an earlier point of time, also in our
+ read view.
+
+ An INSERT would only update bulk_trx_id in
+ row_ins_clust_index_entry_low() if the table really was empty
+ (everything had been purged), when holding a leaf page latch
+ in the clustered index (actually, the root page is the only
+ leaf page in that case).
+
+ We are holding a clustered index leaf page latch here.
+ That will obviously prevent any concurrent INSERT from
+ updating bulk_trx_id while we read it. */
+ if (!online) {
+ } else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
+ ut_ad(trx->read_view.is_open());
+ ut_ad(bulk_trx_id != trx->id);
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ goto func_exit;
+ }
+ }
+
+ if (old_table != new_table) {
+ /* The table is being rebuilt. Identify the columns
+ that were flagged NOT NULL in the new table, so that
+ we can quickly check that the records in the old table
+ do not violate the added NOT NULL constraints. */
+
+ nonnull = static_cast<ulint*>(
+ ut_malloc_nokey(dict_table_get_n_cols(new_table)
+ * sizeof *nonnull));
+
+ for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) {
+ if (dict_table_get_nth_col(old_table, i)->prtype
+ & DATA_NOT_NULL) {
+ continue;
+ }
+
+ const ulint j = col_map[i];
+
+ if (j == ULINT_UNDEFINED) {
+ /* The column was dropped. */
+ continue;
+ }
+
+ if (dict_table_get_nth_col(new_table, j)->prtype
+ & DATA_NOT_NULL) {
+ nonnull[n_nonnull++] = j;
+ }
+ }
+
+ if (!n_nonnull) {
+ ut_free(nonnull);
+ nonnull = NULL;
+ }
+ }
+
+ row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+ if (dict_table_is_comp(old_table)
+ && !dict_table_is_comp(new_table)) {
+ conv_heap = mem_heap_create(sizeof(mrec_buf_t));
+ }
+
+ if (skip_pk_sort) {
+ prev_fields = static_cast<dfield_t*>(
+ ut_malloc_nokey(n_uniq * sizeof *prev_fields));
+ mtuple_heap = mem_heap_create(sizeof(mrec_buf_t));
+ }
+
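+	/* Prepare the system versioning period values for rows that
+	will be written to a versioned new_table: the vers_start column
+	is set to trx->id and the vers_end column to TRX_ID_MAX. */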
+ mach_write_to_8(new_sys_trx_start, trx->id);
+ mach_write_to_8(new_sys_trx_end, TRX_ID_MAX);
+
+ /* Scan the clustered index. */
+ for (;;) {
+ /* Do not continue if table pages are still encrypted */
+ if (!old_table->is_readable() || !new_table->is_readable()) {
+ err = DB_DECRYPTION_FAILED;
+ goto err_exit;
+ }
+
+ const rec_t* rec;
+ trx_id_t rec_trx_id;
+ rec_offs* offsets;
+ dtuple_t* row;
+ row_ext_t* ext;
+ page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
+ bool history_row, history_fts = false;
+
+ stage->n_pk_recs_inc();
+
+ if (!page_cur_move_to_next(cur)) {
+corrupted_rec:
+ err = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ if (page_cur_is_after_last(cur)) {
+
+ stage->inc();
+
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ err = DB_INTERRUPTED;
+ goto err_exit;
+ }
+
+ if (online && old_table != new_table) {
+ err = row_log_table_get_error(clust_index);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+
+ /* Insert the cached spatial index rows. */
+ err = row_merge_spatial_rows(
+ trx->id, sp_tuples, num_spatial,
+ row_heap, &pcur, mtr_started, &mtr);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ mem_heap_empty(row_heap);
+
+ if (!mtr_started) {
+ goto scan_next;
+ }
+
+ if (clust_index->lock.is_waiting()) {
+ /* There are waiters on the clustered
+ index tree lock, likely the purge
+ thread. Store and restore the cursor
+ position, and yield so that scanning a
+ large table will not starve other
+ threads. */
+
+ /* Store the cursor position on the last user
+ record on the page. */
+ if (!btr_pcur_move_to_prev_on_page(&pcur)) {
+ goto corrupted_index;
+ }
+ /* Leaf pages must never be empty, unless
+ this is the only page in the index tree. */
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ && btr_pcur_get_block(&pcur)->page.id()
+ .page_no() != clust_index->page) {
+ goto corrupted_index;
+ }
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr.commit();
+ mtr_started = false;
+
+ /* Give the waiters a chance to proceed. */
+ std::this_thread::yield();
+scan_next:
+ ut_ad(!mtr_started);
+ ut_ad(!mtr.is_active());
+ mtr.start();
+ mtr_started = true;
+ /* Restore position on the record, or its
+ predecessor if the record was purged
+ meanwhile. */
+ if (pcur.restore_position(BTR_SEARCH_LEAF,
+ &mtr)
+ == btr_pcur_t::CORRUPTED) {
+corrupted_index:
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+ /* Move to the successor of the
+ original record. */
+ if (!btr_pcur_move_to_next_user_rec(
+ &pcur, &mtr)) {
+end_of_index:
+ row = NULL;
+ mtr.commit();
+ mtr_started = false;
+ mem_heap_free(row_heap);
+ row_heap = NULL;
+ ut_free(nonnull);
+ nonnull = NULL;
+ goto write_buffers;
+ }
+ } else {
+ uint32_t next_page_no = btr_page_get_next(
+ page_cur_get_page(cur));
+
+ if (next_page_no == FIL_NULL) {
+ goto end_of_index;
+ }
+
+ buf_block_t* block = buf_page_get_gen(
+ page_id_t(old_table->space->id,
+ next_page_no),
+ old_table->space->zip_size(),
+ RW_S_LATCH, nullptr, BUF_GET, &mtr,
+ &err, false);
+ if (!block) {
+ goto err_exit;
+ }
+
+ page_cur_set_before_first(block, cur);
+ if (!page_cur_move_to_next(cur)
+ || page_cur_is_after_last(cur)) {
+ goto corrupted_rec;
+ }
+
+ const auto s = mtr.get_savepoint();
+ mtr.rollback_to_savepoint(s - 2, s - 1);
+ }
+ } else {
+ mem_heap_empty(row_heap);
+ }
+
+ rec = page_cur_get_rec(cur);
+
+ if (online) {
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &row_heap);
+ rec_trx_id = row_get_rec_trx_id(rec, clust_index,
+ offsets);
+
+ /* Perform a REPEATABLE READ.
+
+ When rebuilding the table online,
+ row_log_table_apply() must not see a newer
+ state of the table when applying the log.
+ This is mainly to prevent false duplicate key
+ errors, because the log will identify records
+ by the PRIMARY KEY, and also to prevent unsafe
+ BLOB access.
+
+ When creating a secondary index online, this
+ table scan must not see records that have only
+ been inserted to the clustered index, but have
+ not been written to the online_log of
+ index[]. If we performed READ UNCOMMITTED, it
+ could happen that the ADD INDEX reaches
+ ONLINE_INDEX_COMPLETE state between the time
+ the DML thread has updated the clustered index
+ but has not yet accessed secondary index. */
+ ut_ad(trx->read_view.is_open());
+ ut_ad(rec_trx_id != trx->id);
+
+ if (!trx->read_view.changes_visible(rec_trx_id)) {
+ if (rec_trx_id
+ >= trx->read_view.low_limit_id()
+ && rec_trx_id
+ >= trx_sys.get_max_trx_id()) {
+ goto corrupted_rec;
+ }
+
+ rec_t* old_vers;
+
+ row_vers_build_for_consistent_read(
+ rec, &mtr, clust_index, &offsets,
+ &trx->read_view, &row_heap,
+ row_heap, &old_vers, NULL);
+
+ if (!old_vers) {
+ continue;
+ }
+
+ /* The old version must necessarily be
+ in the "prehistory", because the
+ exclusive lock in
+ ha_innobase::prepare_inplace_alter_table()
+ forced the completion of any transactions
+ that accessed this table. */
+ ut_ad(row_get_rec_trx_id(old_vers, clust_index,
+ offsets) < trx->id);
+
+ rec = old_vers;
+ rec_trx_id = 0;
+ }
+
+ if (rec_get_deleted_flag(
+ rec,
+ dict_table_is_comp(old_table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record.
+ Above, we did reset rec_trx_id = 0
+				for rec = old_vers. */
+ ut_ad(rec == page_cur_get_rec(cur)
+ ? rec_trx_id
+ : !rec_trx_id);
+ /* This record was deleted in the latest
+ committed version, or it was deleted and
+ then reinserted-by-update before purge
+ kicked in. Skip it. */
+ continue;
+ }
+
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+ } else if (rec_get_deleted_flag(
+ rec, dict_table_is_comp(old_table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_d(rec_trx_id = rec_get_trx_id(rec, clust_index));
+ ut_ad(rec_trx_id);
+ /* This must be a purgeable delete-marked record,
+ and the transaction that delete-marked the record
+ must have been committed before this
+ !online ALTER TABLE transaction. */
+ ut_ad(rec_trx_id < trx->id);
+ /* Skip delete-marked records.
+
+ Skipping delete-marked records will make the
+			created indexes unusable for transactions
+ whose read views were created before the index
+ creation completed, but an attempt to preserve
+ the history would make it tricky to detect
+ duplicate keys. */
+ continue;
+ } else {
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &row_heap);
+ /* This is a locking ALTER TABLE.
+
+ If we are not rebuilding the table, the
+ DB_TRX_ID does not matter, as it is not being
+ written to any secondary indexes; see
+ if (old_table == new_table) below.
+
+ If we are rebuilding the table, the
+ DB_TRX_ID,DB_ROLL_PTR should be reset, because
+ there will be no history available. */
+ ut_ad(rec_get_trx_id(rec, clust_index) < trx->id);
+ rec_trx_id = 0;
+ }
+
+ /* When !online, we are holding a lock on old_table, preventing
+ any inserts that could have written a record 'stub' before
+ writing out off-page columns. */
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+
+ /* Build a row based on the clustered index. */
+
+ row = row_build_w_add_vcol(ROW_COPY_POINTERS, clust_index,
+ rec, offsets, new_table,
+ defaults, add_v, col_map, &ext,
+ row_heap);
+ ut_ad(row);
+
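+		/* A history row of a system-versioned table is identified
+		by its vers_end field. Such rows are not assigned a new
+		FTS_DOC_ID below. */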
+ history_row = new_table->versioned()
+ && dtuple_get_nth_field(row, new_table->vers_end)
+ ->vers_history_row();
+ history_fts = history_row && new_table->fts;
+
+ for (ulint i = 0; i < n_nonnull; i++) {
+ dfield_t* field = &row->fields[nonnull[i]];
+
+ ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL);
+
+ if (dfield_is_null(field)) {
+
+ Field* null_field =
+ table->field[nonnull[i]];
+
+ null_field->set_warning(
+ Sql_condition::WARN_LEVEL_WARN,
+ WARN_DATA_TRUNCATED, 1,
+ ulong(n_rows + 1));
+
+ if (!allow_not_null) {
+ err = DB_INVALID_NULL;
+ goto err_exit;
+ }
+
+ const dfield_t& default_field
+ = defaults->fields[nonnull[i]];
+
+ *field = default_field;
+ }
+ }
+
+ /* Get the next Doc ID */
+ if (add_doc_id && !history_fts) {
+ doc_id++;
+ } else {
+ doc_id = 0;
+ }
+
+ ut_ad(row->fields[new_trx_id_col].type.mtype == DATA_SYS);
+ ut_ad(row->fields[new_trx_id_col].type.prtype
+ == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(row->fields[new_trx_id_col].len == DATA_TRX_ID_LEN);
+ ut_ad(row->fields[new_trx_id_col + 1].type.mtype == DATA_SYS);
+ ut_ad(row->fields[new_trx_id_col + 1].type.prtype
+ == (DATA_ROLL_PTR | DATA_NOT_NULL));
+ ut_ad(row->fields[new_trx_id_col + 1].len == DATA_ROLL_PTR_LEN);
+
+ if (old_table == new_table) {
+ /* Do not bother touching DB_TRX_ID,DB_ROLL_PTR
+ because they are not going to be written into
+ secondary indexes. */
+ } else if (rec_trx_id < trx->id) {
+ /* Reset the DB_TRX_ID,DB_ROLL_PTR of old rows
+ for which history is not going to be
+ available after the rebuild operation.
+ This essentially mimics row_purge_reset_trx_id(). */
+ row->fields[new_trx_id_col].data
+ = const_cast<byte*>(reset_trx_id);
+ row->fields[new_trx_id_col + 1].data
+ = const_cast<byte*>(reset_trx_id
+ + DATA_TRX_ID_LEN);
+ }
+
+ if (add_autoinc != ULINT_UNDEFINED) {
+
+ ut_ad(add_autoinc
+ < dict_table_get_n_user_cols(new_table));
+
+ dfield_t* dfield = dtuple_get_nth_field(row,
+ add_autoinc);
+
+ if (new_table->versioned()) {
+ if (history_row) {
+ if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) {
+ err = DB_UNSUPPORTED;
+ my_error(ER_UNSUPPORTED_EXTENSION, MYF(0),
+ old_table->name.m_name);
+ goto func_exit;
+ }
+ dfield_set_null(dfield);
+ } else {
+					/* Make the field non-NULL; the
+					generated value is written into
+					this buffer below. */
+ ulint len = dfield_get_type(dfield)->len;
+ dfield_set_data(dfield, any_autoinc_data, len);
+ }
+ }
+
+ if (dfield_is_null(dfield)) {
+ goto write_buffers;
+ }
+
+ const dtype_t* dtype = dfield_get_type(dfield);
+ byte* b = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (sequence.eof()) {
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_AUTOINC_READ_FAILED, "[NULL]");
+ err = DB_ERROR;
+ goto err_exit;
+ }
+
+ ulonglong value = sequence++;
+
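+			/* Store the generated value in the column's
+			native storage format. */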
+ switch (dtype_get_mtype(dtype)) {
+ case DATA_INT: {
+ ibool usign;
+ ulint len = dfield_get_len(dfield);
+
+ usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+ mach_write_ulonglong(b, value, len, usign);
+
+ break;
+ }
+
+ case DATA_FLOAT:
+ mach_float_write(
+ b, static_cast<float>(value));
+ break;
+
+ case DATA_DOUBLE:
+ mach_double_write(
+ b, static_cast<double>(value));
+ break;
+
+ default:
+ ut_ad(0);
+ }
+ }
+
+ if (old_table->versioned()) {
+ if (!new_table->versioned()
+ && clust_index->vers_history_row(rec, offsets)) {
+ continue;
+ }
+ } else if (new_table->versioned()) {
+ dfield_t* start =
+ dtuple_get_nth_field(row, new_table->vers_start);
+ dfield_t* end =
+ dtuple_get_nth_field(row, new_table->vers_end);
+ dfield_set_data(start, new_sys_trx_start, 8);
+ dfield_set_data(end, new_sys_trx_end, 8);
+ vers_update_trt = true;
+ }
+
+write_buffers:
+ /* Build all entries for all the indexes to be created
+ in a single scan of the clustered index. */
+
+ n_rows++;
+ ulint s_idx_cnt = 0;
+ bool skip_sort = skip_pk_sort
+ && dict_index_is_clust(merge_buf[0]->index);
+
+ for (ulint k = 0, i = 0; i < n_index; i++, skip_sort = false) {
+ row_merge_buf_t* buf = merge_buf[i];
+ ulint rows_added = 0;
+
+ if (dict_index_is_spatial(buf->index)) {
+ if (!row) {
+ continue;
+ }
+
+ ut_ad(sp_tuples[s_idx_cnt]->index
+ == buf->index);
+
+ /* If the geometry field is invalid, report
+ error. */
+ if (!row_geo_field_is_valid(row, buf->index)) {
+ err = DB_CANT_CREATE_GEOMETRY_OBJECT;
+ break;
+ }
+
+ sp_tuples[s_idx_cnt]->add(row, ext, buf->heap);
+ s_idx_cnt++;
+
+ continue;
+ }
+
+ ut_ad(!row
+ || !dict_index_is_clust(buf->index)
+ || trx_id_check(row->fields[new_trx_id_col].data,
+ trx->id));
+
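+			/* Spatial index tuples were buffered in sp_tuples
+			above, so only the remaining indexes consume an
+			entry in files[]. */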
+ merge_file_t* file = &files[k++];
+
+ if (UNIV_LIKELY
+ (row && (rows_added = row_merge_buf_add(
+ buf, fts_index, old_table, new_table,
+ psort_info, row, ext, history_fts,
+ &doc_id, conv_heap, &err,
+ &v_heap, eval_table, trx,
+ col_collate)))) {
+
+				/* If we are creating an FTS index,
+				a single row can generate multiple
+				records for the tokenized words */
+ file->n_rec += rows_added;
+
+ if (err != DB_SUCCESS) {
+ ut_ad(err == DB_TOO_BIG_RECORD);
+ break;
+ }
+
+ if (doc_id > max_doc_id) {
+ max_doc_id = doc_id;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+					/* Check if an error occurred in a child thread */
+ for (ulint j = 0;
+ j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error
+ != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = i;
+ break;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ if (skip_sort) {
+ ut_ad(buf->n_tuples > 0);
+ const mtuple_t* curr =
+ &buf->tuples[buf->n_tuples - 1];
+
+ ut_ad(i == 0);
+ ut_ad(dict_index_is_clust(merge_buf[0]->index));
+ /* Detect duplicates by comparing the
+ current record with previous record.
+ When temp file is not used, records
+ should be in sorted order. */
+ if (prev_mtuple.fields != NULL
+ && (row_mtuple_cmp(
+ &prev_mtuple, curr,
+ &clust_dup) == 0)) {
+
+ err = DB_DUPLICATE_KEY;
+ trx->error_key_num
+ = key_numbers[0];
+ goto func_exit;
+ }
+
+ prev_mtuple.fields = curr->fields;
+ }
+
+ continue;
+ }
+
+ if (err == DB_COMPUTE_VALUE_FAILED) {
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+ if (!row || !doc_id) {
+ continue;
+ }
+ }
+
+ /* The buffer must be sufficiently large
+ to hold at least one record. It may only
+ be empty when we reach the end of the
+ clustered index. row_merge_buf_add()
+ must not have been called in this loop. */
+ ut_ad(buf->n_tuples || row == NULL);
+
+ /* We have enough data tuples to form a block.
+ Sort them and write to disk if temp file is used
+ or insert into index if temp file is not used. */
+ ut_ad(old_table == new_table
+ ? !dict_index_is_clust(buf->index)
+ : (i == 0) == dict_index_is_clust(buf->index));
+
+ /* We have enough data tuples to form a block.
+ Sort them (if !skip_sort) and write to disk. */
+
+ if (buf->n_tuples) {
+ if (skip_sort) {
+				/* The temporary file is not used,
+				so insert the sorted block into the index */
+ if (row != NULL) {
+					/* We have to insert the
+					cached spatial index rows, since
+					after the mtr.commit() the clustered
+					index page could be updated and
+					the data in the cached rows would
+					become invalid. */
+ err = row_merge_spatial_rows(
+ trx->id, sp_tuples,
+ num_spatial,
+ row_heap,
+ &pcur, mtr_started,
+ &mtr);
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* We are not at the end of
+ the scan yet. We must
+ mtr.commit() in order to be
+ able to call log_free_check()
+ in row_merge_insert_index_tuples().
+ Due to mtr.commit(), the
+ current row will be invalid, and
+ we must reread it on the next
+ loop iteration. */
+ if (mtr_started) {
+ if (!btr_pcur_move_to_prev_on_page(&pcur)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+ btr_pcur_store_position(
+ &pcur, &mtr);
+
+ mtr.commit();
+ mtr_started = false;
+ }
+ }
+
+ mem_heap_empty(mtuple_heap);
+ prev_mtuple.fields = prev_fields;
+
+ row_mtuple_create(
+ &buf->tuples[buf->n_tuples - 1],
+ &prev_mtuple, n_uniq,
+ mtuple_heap);
+
+ if (clust_btr_bulk == NULL) {
+ clust_btr_bulk = UT_NEW_NOKEY(
+ BtrBulk(index[i],
+ trx));
+ } else {
+ clust_btr_bulk->latch();
+ }
+
+ err = row_merge_insert_index_tuples(
+ index[i], old_table,
+ OS_FILE_CLOSED, NULL, buf,
+ clust_btr_bulk,
+ table_total_rows,
+ curr_progress,
+ pct_cost,
+ crypt_block,
+ new_table->space_id);
+
+ if (row == NULL) {
+ err = clust_btr_bulk->finish(
+ err);
+ UT_DELETE(clust_btr_bulk);
+ clust_btr_bulk = NULL;
+ } else {
+					/* Release latches for a possible
+					log_free_check() in the spatial
+					index build. */
+ clust_btr_bulk->release();
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (row != NULL) {
+ /* Restore the cursor on the
+ previous clustered index record,
+ and empty the buffer. The next
+ iteration of the outer loop will
+ advance the cursor and read the
+ next record (the one which we
+ had to ignore due to the buffer
+ overflow). */
+ mtr.start();
+ mtr_started = true;
+ if (pcur.restore_position(
+ BTR_SEARCH_LEAF, &mtr)
+ == btr_pcur_t::CORRUPTED) {
+ goto corrupted_index;
+ }
+ buf = row_merge_buf_empty(buf);
+ merge_buf[i] = buf;
+ /* Restart the outer loop on the
+ record. We did not insert it
+ into any index yet. */
+ ut_ad(i == 0);
+ break;
+ }
+ } else if (dict_index_is_unique(buf->index)) {
+ row_merge_dup_t dup = {
+ buf->index, table, col_map, 0};
+
+ row_merge_buf_sort(buf, &dup);
+
+ if (dup.n_dup) {
+ err = DB_DUPLICATE_KEY;
+ trx->error_key_num
+ = key_numbers[i];
+ break;
+ }
+ } else {
+ row_merge_buf_sort(buf, NULL);
+ }
+ } else if (online && new_table == old_table) {
+ /* Note the newest transaction that
+ modified this index when the scan was
+ completed. We prevent older readers
+ from accessing this index, to ensure
+ read consistency. */
+
+ ut_a(row == NULL);
+
+ dict_index_t* index = buf->index;
+ index->lock.x_lock(SRW_LOCK_CALL);
+ ut_a(dict_index_get_online_status(index)
+ == ONLINE_INDEX_CREATION);
+
+ trx_id_t max_trx_id = row_log_get_max_trx(
+ index);
+
+ if (max_trx_id > index->trx_id) {
+ index->trx_id = max_trx_id;
+ }
+
+ index->lock.x_unlock();
+ }
+
+			/* A secondary index, or a clustered index whose
+			records are not in sorted order, can use the
+			temporary file. A fulltext index must not use
+			the temporary file. */
+ if (!skip_sort && !(buf->index->type & DICT_FTS)) {
+				/* If all rows fit in the sort buffer,
+				we can insert directly into the index
+				without a temporary file, provided the
+				clustered index did not use a temporary
+				file either. */
+ if (row == NULL && file->fd == OS_FILE_CLOSED
+ && !clust_temp_file) {
+ DBUG_EXECUTE_IF(
+ "row_merge_write_failure",
+ err = DB_TEMP_FILE_WRITE_FAIL;
+ trx->error_key_num = i;
+ goto all_done;);
+
+ DBUG_EXECUTE_IF(
+ "row_merge_tmpfile_fail",
+ err = DB_OUT_OF_MEMORY;
+ trx->error_key_num = i;
+ goto all_done;);
+
+ BtrBulk btr_bulk(index[i], trx);
+
+ err = row_merge_insert_index_tuples(
+ index[i], old_table,
+ OS_FILE_CLOSED, NULL, buf,
+ &btr_bulk,
+ table_total_rows,
+ curr_progress,
+ pct_cost,
+ crypt_block,
+ new_table->space_id);
+
+ err = btr_bulk.finish(err);
+
+ DBUG_EXECUTE_IF(
+ "row_merge_insert_big_row",
+ err = DB_TOO_BIG_RECORD;);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ } else {
+ if (!row_merge_file_create_if_needed(
+ file, tmpfd,
+ buf->n_tuples, path)) {
+ err = DB_OUT_OF_MEMORY;
+ trx->error_key_num = i;
+ break;
+ }
+
+ /* Ensure that duplicates in the
+ clustered index will be detected before
+ inserting secondary index records. */
+ if (dict_index_is_clust(buf->index)) {
+ clust_temp_file = true;
+ }
+
+ ut_ad(file->n_rec > 0);
+
+ row_merge_buf_write(buf,
+#ifndef DBUG_OFF
+ file,
+#endif
+ block);
+
+ if (!row_merge_write(
+ file->fd, file->offset++,
+ block, crypt_block,
+ new_table->space_id)) {
+ err = DB_TEMP_FILE_WRITE_FAIL;
+ trx->error_key_num = i;
+ break;
+ }
+
+ MEM_UNDEFINED(
+ &block[0], srv_sort_buf_size);
+ }
+ }
+ merge_buf[i] = row_merge_buf_empty(buf);
+ buf = merge_buf[i];
+
+ if (UNIV_LIKELY(row != NULL)) {
+ /* Try writing the record again, now
+ that the buffer has been written out
+ and emptied. */
+
+ if (UNIV_UNLIKELY
+ (!(rows_added = row_merge_buf_add(
+ buf, fts_index, old_table,
+ new_table, psort_info,
+ row, ext, history_fts, &doc_id,
+ conv_heap, &err, &v_heap,
+ eval_table, trx, col_collate)))) {
+ /* An empty buffer should have enough
+ room for at least one record. */
+ ut_ad(err == DB_COMPUTE_VALUE_FAILED
+ || err == DB_OUT_OF_MEMORY
+ || err == DB_TOO_BIG_RECORD);
+ } else if (err == DB_SUCCESS) {
+ file->n_rec += rows_added;
+ continue;
+ }
+
+ trx->error_key_num = i;
+ break;
+ }
+ }
+
+ if (row == NULL) {
+ if (old_table != new_table) {
+ new_table->stat_n_rows = n_rows;
+ }
+
+ goto all_done;
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (v_heap) {
+ mem_heap_empty(v_heap);
+ }
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ read_rows++;
+		if (read_rows % 1000 == 0) {
+ /* Update progress for each 1000 rows */
+ curr_progress = (read_rows >= table_total_rows) ?
+ pct_cost :
+ pct_cost * static_cast<double>(read_rows)
+ / static_cast<double>(table_total_rows);
+ /* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (ulint) (curr_progress * 100);
+ }
+ }
+
+func_exit:
+ ut_ad(mtr_started == mtr.is_active());
+ if (mtr_started) {
+ mtr.commit();
+ }
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ ut_free(nonnull);
+
+all_done:
+ if (clust_btr_bulk != NULL) {
+ ut_ad(err != DB_SUCCESS);
+ clust_btr_bulk->latch();
+ err = clust_btr_bulk->finish(
+ err);
+ UT_DELETE(clust_btr_bulk);
+ }
+
+ if (prev_fields) {
+ ut_free(prev_fields);
+ mem_heap_free(mtuple_heap);
+ }
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ if (conv_heap != NULL) {
+ mem_heap_free(conv_heap);
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
+#endif
+ if (UNIV_LIKELY_NULL(fts_parallel_sort_cond)) {
+wait_again:
+		/* Check if an error occurred in a child thread */
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = j;
+ break;
+ }
+ }
+
+ /* Tell all children that parent has done scanning */
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (err == DB_SUCCESS) {
+ psort_info[i].state = FTS_PARENT_COMPLETE;
+ } else {
+ psort_info[i].state = FTS_PARENT_EXITING;
+ }
+ }
+
+		/* Now wait for all children to report back that they are done */
+ timespec abstime;
+ set_timespec(abstime, 1);
+ mysql_mutex_lock(&psort_info[0].mutex);
+ my_cond_timedwait(fts_parallel_sort_cond,
+ &psort_info[0].mutex.m_mutex, &abstime);
+ mysql_mutex_unlock(&psort_info[0].mutex);
+
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (!psort_info[i].child_status) {
+ goto wait_again;
+ }
+ }
+
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ psort_info[j].task->wait();
+ delete psort_info[j].task;
+ }
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
+#endif
+ for (ulint i = 0; i < n_index; i++) {
+ row_merge_buf_free(merge_buf[i]);
+ }
+
+ row_fts_free_pll_merge_buf(psort_info);
+
+ ut_free(merge_buf);
+ ut_free(pcur.old_rec_buf);
+
+ if (sp_tuples != NULL) {
+ for (ulint i = 0; i < num_spatial; i++) {
+ UT_DELETE(sp_tuples[i]);
+ }
+ ut_free(sp_tuples);
+ }
+
+ /* Update the next Doc ID we used. Table should be locked, so
+ no concurrent DML */
+ if (max_doc_id && err == DB_SUCCESS) {
+ /* Sync fts cache for other fts indexes to keep all
+ fts indexes consistent in sync_doc_id. */
+ err = fts_sync_table(const_cast<dict_table_t*>(new_table));
+
+ if (err == DB_SUCCESS) {
+ new_table->fts->cache->synced_doc_id = max_doc_id;
+
+ /* Update the max value as next FTS_DOC_ID */
+ if (max_doc_id >= new_table->fts->cache->next_doc_id) {
+ new_table->fts->cache->next_doc_id =
+ max_doc_id + 1;
+ }
+
+ new_table->fts->cache->first_doc_id =
+ new_table->fts->cache->next_doc_id;
+
+ err= fts_update_sync_doc_id(
+ new_table,
+ new_table->fts->cache->synced_doc_id,
+ NULL);
+ }
+ }
+
+ if (vers_update_trt) {
+ trx->mod_tables.emplace(new_table, 0)
+ .first->second.set_versioned(0);
+ }
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+@param N number of the buffer (0 or 1)
+@param INDEX record descriptor
+@param AT_END statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) \
+ do { \
+ b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \
+ &buf[2], b2, \
+ of->fd, &of->offset, \
+ mrec##N, offsets##N, \
+ crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL , \
+ space); \
+ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \
+ goto corrupt; \
+ } \
+ b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\
+ &buf[N], b##N, INDEX, \
+ file->fd, foffs##N, \
+ &mrec##N, offsets##N, \
+ crypt_block ? &crypt_block[N * srv_sort_buf_size] : NULL, \
+ space); \
+ \
+ if (UNIV_UNLIKELY(!b##N)) { \
+ if (mrec##N) { \
+ goto corrupt; \
+ } \
+ AT_END; \
+ } \
+ } while (0)
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \
+ do { \
+ if (stage != NULL) { \
+ stage->inc(); \
+ } \
+ ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END); \
+ } while (0)
+#else /* HAVE_PSI_STAGE_INTERFACE */
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \
+ ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END)
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Merge two blocks of records on disk and write a bigger block.
+@param[in] dup descriptor of index being created
+@param[in] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] foffs0 offset of first source list in the file
+@param[in,out] foffs1 offset of second source list in the file
+@param[in,out] of output file
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out] crypt_block encryption buffer
+@param[in] space tablespace ID for encryption
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_blocks(
+ const row_merge_dup_t* dup,
+ const merge_file_t* file,
+ row_merge_block_t* block,
+ ulint* foffs0,
+ ulint* foffs1,
+ merge_file_t* of,
+ ut_stage_alter_t* stage MY_ATTRIBUTE((unused)),
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ const byte* b1; /*!< pointer to block[srv_sort_buf_size] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
+ const mrec_t* mrec1; /*!< merge rec, points to
+ block[srv_sort_buf_size] or buf[1] */
+ rec_offs* offsets0;/* offsets of mrec0 */
+ rec_offs* offsets1;/* offsets of mrec1 */
+
+ DBUG_ENTER("row_merge_blocks");
+ DBUG_LOG("ib_merge_sort",
+ "fd=" << file->fd << ',' << *foffs0 << '+' << *foffs1
+ << " to fd=" << of->fd << ',' << of->offset);
+
+ heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0],
+ crypt_block ? &crypt_block[0] : NULL,
+ space) ||
+ !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size],
+ crypt_block ? &crypt_block[srv_sort_buf_size] : NULL,
+ space)) {
+corrupt:
+ mem_heap_free(heap);
+ DBUG_RETURN(DB_CORRUPTION);
+ }
+
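+	/* block[] consists of three buffers of srv_sort_buf_size
+	bytes each: the two input runs and the output. */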
+ b0 = &block[0];
+ b1 = &block[srv_sort_buf_size];
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(
+ &block[0], &buf[0], b0, dup->index,
+ file->fd, foffs0, &mrec0, offsets0,
+ crypt_block ? &crypt_block[0] : NULL,
+ space);
+
+ b1 = row_merge_read_rec(
+ &block[srv_sort_buf_size],
+ &buf[srv_sort_buf_size], b1, dup->index,
+ file->fd, foffs1, &mrec1, offsets1,
+ crypt_block ? &crypt_block[srv_sort_buf_size] : NULL,
+ space);
+
+ if (UNIV_UNLIKELY(!b0 && mrec0)
+ || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+ goto corrupt;
+ }
+
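+	/* Merge the two sorted input runs into the output. Records
+	that compare equal are reported as a duplicate key. */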
+ while (mrec0 && mrec1) {
+ int cmp = cmp_rec_rec_simple(
+ mrec0, mrec1, offsets0, offsets1,
+ dup->index, dup->table);
+ if (cmp < 0) {
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged);
+ } else if (cmp) {
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged);
+ } else {
+ mem_heap_free(heap);
+ DBUG_RETURN(DB_DUPLICATE_KEY);
+ }
+ }
+
+merged:
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0);
+ }
+ }
+done0:
+ if (mrec1) {
+ /* append all mrec1 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1);
+ }
+ }
+done1:
+
+ mem_heap_free(heap);
+
+ b2 = row_merge_write_eof(
+ &block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset,
+ crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL,
+ space);
+ DBUG_RETURN(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/** Copy a block of index entries.
+@param[in] index index being created
+@param[in] file input file
+@param[in,out] block 3 buffers
+@param[in,out] foffs0 input file offset
+@param[in,out] of output file
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out] crypt_block encryption buffer
+@param[in] space tablespace ID for encryption
+@return TRUE on success, FALSE on failure */
+static MY_ATTRIBUTE((warn_unused_result))
+ibool
+row_merge_blocks_copy(
+ const dict_index_t* index,
+ const merge_file_t* file,
+ row_merge_block_t* block,
+ ulint* foffs0,
+ merge_file_t* of,
+ ut_stage_alter_t* stage MY_ATTRIBUTE((unused)),
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] */
+ rec_offs* offsets0;/* offsets of mrec0 */
+ rec_offs* offsets1;/* dummy offsets */
+
+ DBUG_ENTER("row_merge_blocks_copy");
+ DBUG_LOG("ib_merge_sort",
+ "fd=" << file->fd << ',' << foffs0
+ << " to fd=" << of->fd << ',' << of->offset);
+
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0],
+ crypt_block ? &crypt_block[0] : NULL,
+ space)) {
+corrupt:
+ mem_heap_free(heap);
+ DBUG_RETURN(FALSE);
+ }
+
+ b0 = &block[0];
+
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index,
+ file->fd, foffs0, &mrec0, offsets0,
+ crypt_block ? &crypt_block[0] : NULL,
+ space);
+
+ if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+ goto corrupt;
+ }
+
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0);
+ }
+ }
+done0:
+
+ /* The file offset points to the beginning of the last page
+ that has been read. Update it to point to the next block. */
+ (*foffs0)++;
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(row_merge_write_eof(
+ &block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset,
+ crypt_block
+ ? &crypt_block[2 * srv_sort_buf_size]
+ : NULL, space)
+ != NULL);
+}
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in,out] num_run Number of runs that remain to be merged
+@param[in,out] run_offset Array that contains the first offset number
+for each merge run
+@param[in,out]	stage	performance schema accounting object, used by
+ALTER TABLE. If not NULL stage->inc() will be called for each record
+processed.
+@param[in,out]	crypt_block	encryption buffer
+@param[in]	space	tablespace ID for encryption
+@return DB_SUCCESS or error code */
+static
+dberr_t
+row_merge(
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ ulint* num_run,
+ ulint* run_offset,
+ ut_stage_alter_t* stage,
+ row_merge_block_t* crypt_block,
+ ulint space)
+{
+ ulint foffs0; /*!< first input offset */
+ ulint foffs1; /*!< second input offset */
+ dberr_t error; /*!< error code */
+ merge_file_t of; /*!< output file */
+ const ulint ihalf = run_offset[*num_run / 2];
+ /*!< half the input file */
+ ulint n_run = 0;
+ /*!< num of runs generated from this merge */
+
+ MEM_CHECK_ADDRESSABLE(&block[0], 3 * srv_sort_buf_size);
+
+ if (crypt_block) {
+ MEM_CHECK_ADDRESSABLE(&crypt_block[0], 3 * srv_sort_buf_size);
+ }
+
+ ut_ad(ihalf < file->offset);
+
+ of.fd = *tmpfd;
+ of.offset = 0;
+ of.n_rec = 0;
+
+#ifdef POSIX_FADV_SEQUENTIAL
+ /* The input file will be read sequentially, starting from the
+ beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL
+ affects the entire file. Each block will be read exactly once. */
+ posix_fadvise(file->fd, 0, 0,
+ POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
+ /* Merge blocks to the output file. */
+ foffs0 = 0;
+ foffs1 = ihalf;
+
+ MEM_UNDEFINED(run_offset, *num_run * sizeof *run_offset);
+
+ for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ error = row_merge_blocks(dup, file, block,
+ &foffs0, &foffs1, &of, stage,
+ crypt_block, space);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ }
+
+ /* Copy the last blocks, if there are any. */
+
+ while (foffs0 < ihalf) {
+
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs0, &of, stage,
+ crypt_block, space)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs0 == ihalf);
+
+ while (foffs1 < file->offset) {
+
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs1, &of, stage,
+ crypt_block, space)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs1 == file->offset);
+
+ if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(n_run <= *num_run);
+
+ *num_run = n_run;
+
+	/* Each run can contain one or more offsets. As the merge
+	proceeds, the number of runs to merge decreases until there is
+	a single run. So the number of runs can never exceed the
+	number of offsets in the file */
+ ut_ad((*num_run) <= file->offset);
+
+	/* The number of offsets in the output file is always less
+	than or equal to that of the input file */
+ ut_ad(of.offset <= file->offset);
+
+ /* Swap file descriptors for the next pass. */
+ *tmpfd = file->fd;
+ *file = of;
+
+ MEM_UNDEFINED(&block[0], 3 * srv_sort_buf_size);
+
+ return(DB_SUCCESS);
+}
+
+/** Merge disk files.
+@param[in] trx transaction
+@param[in] dup descriptor of index being created
+@param[in,out] file file containing index entries
+@param[in,out] block 3 buffers
+@param[in,out] tmpfd temporary file handle
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+ trx_t* trx,
+ const row_merge_dup_t* dup,
+ merge_file_t* file,
+ row_merge_block_t* block,
+ pfs_os_file_t* tmpfd,
+ const bool update_progress,
+ /*!< in: update progress
+ status variable or not */
+ const double pct_progress,
+ /*!< in: total progress percent
+ until now */
+ const double pct_cost, /*!< in: current progress percent */
+ row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */
+ ulint space, /*!< in: space id */
+ ut_stage_alter_t* stage)
+{
+ const ulint half = file->offset / 2;
+ ulint num_runs;
+ ulint* run_offset;
+ dberr_t error = DB_SUCCESS;
+ ulint merge_count = 0;
+ ulint total_merge_sort_count;
+ double curr_progress = 0;
+
+ DBUG_ENTER("row_merge_sort");
+
+ /* Record the number of merge runs we need to perform */
+ num_runs = file->offset;
+
+ if (stage != NULL) {
+ stage->begin_phase_sort(log2(double(num_runs)));
+ }
+
+	/* If there is at most one run, there is nothing to merge */
+ if (num_runs <= 1) {
+ DBUG_RETURN(error);
+ }
+
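+	/* Each merge pass roughly halves the number of runs, so
+	about log2(num_runs) passes are needed in total. */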
+ total_merge_sort_count = ulint(ceil(log2(double(num_runs))));
+
+ /* "run_offset" records each run's first offset number */
+ run_offset = (ulint*) ut_malloc_nokey(file->offset * sizeof(ulint));
+
+ /* This tells row_merge() where to start for the first round
+ of merge. */
+ run_offset[half] = half;
+
+ /* The file should always contain at least one byte (the end
+ of file marker). Thus, it must be at least one block. */
+ ut_ad(file->offset > 0);
+
+ /* These thd_progress* calls will crash on sol10-64 when innodb_plugin
+ is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on
+ sol10-64 in buildbot.
+ */
+#ifndef __sun__
+ /* Progress report only for "normal" indexes. */
+ if (dup && !(dup->index->type & DICT_FTS)) {
+ thd_progress_init(trx->mysql_thd, 1);
+ }
+#endif /* __sun__ */
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : merge-sorting"
+ " has estimated " ULINTPF " runs",
+ num_runs);
+ }
+
+ /* Merge the runs until we have one big run */
+ do {
+ /* Report progress of merge sort to MySQL for
+ show processlist progress field */
+ /* Progress report only for "normal" indexes. */
+#ifndef __sun__
+ if (dup && !(dup->index->type & DICT_FTS)) {
+ thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset);
+ }
+#endif /* __sun__ */
+
+ error = row_merge(trx, dup, file, block, tmpfd,
+ &num_runs, run_offset, stage,
+ crypt_block, space);
+
+		if (update_progress) {
+ merge_count++;
+ curr_progress = (merge_count >= total_merge_sort_count) ?
+ pct_cost :
+ pct_cost * static_cast<double>(merge_count)
+ / static_cast<double>(total_merge_sort_count);
+			/* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
+ }
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ MEM_CHECK_DEFINED(run_offset, num_runs * sizeof *run_offset);
+ } while (num_runs > 1);
+
+ ut_free(run_offset);
+
+ /* Progress report only for "normal" indexes. */
+#ifndef __sun__
+ if (dup && !(dup->index->type & DICT_FTS)) {
+ thd_progress_end(trx->mysql_thd);
+ }
+#endif /* __sun__ */
+
+ DBUG_RETURN(error);
+}
+
+/** Copy the BLOBs from the given blob file and store them
+in the field data of the tuple
+@param tuple tuple to be inserted
+@param heap heap to allocate the memory for the blob storage
+@param blob_file file to handle blob data */
+static dberr_t row_merge_copy_blob_from_file(dtuple_t *tuple, mem_heap_t *heap,
+ merge_file_t *blob_file)
+{
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++)
+ {
+ dfield_t *field= dtuple_get_nth_field(tuple, i);
+ const byte *field_data= static_cast<byte*>(dfield_get_data(field));
+ ulint field_len= dfield_get_len(field);
+ if (!dfield_is_ext(field))
+ continue;
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(!dfield_is_null(field));
+
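+    /* The field data is a reference into blob_file: 8 zero bytes,
+    followed by the 8-byte file offset and the 4-byte length of the
+    externally stored column. */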
+ ut_ad(mach_read_from_8(field_data) == 0);
+ uint64_t offset= mach_read_from_8(field_data + 8);
+ uint32_t len= mach_read_from_4(field_data + 16);
+
+ byte *data= (byte*) mem_heap_alloc(heap, len);
+ if (dberr_t err= os_file_read(IORequestRead, blob_file->fd, data,
+ offset, len, nullptr))
+ return err;
+ dfield_set_data(field, data, len);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Copy externally stored columns to the data tuple.
+@param[in] mrec record containing BLOB pointers,
+or NULL to use tuple instead
+@param[in] offsets offsets of mrec
+@param[in] zip_size compressed page size in bytes, or 0
+@param[in,out] tuple data tuple
+@param[in,out] heap memory heap */
+static
+void
+row_merge_copy_blobs(
+ const mrec_t* mrec,
+ const rec_offs* offsets,
+ ulint zip_size,
+ dtuple_t* tuple,
+ mem_heap_t* heap)
+{
+ ut_ad(mrec == NULL || rec_offs_any_extern(offsets));
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* data;
+ dfield_t* field = dtuple_get_nth_field(tuple, i);
+ ulint field_len;
+ const byte* field_data;
+
+ if (!dfield_is_ext(field)) {
+ continue;
+ }
+
+ ut_ad(!dfield_is_null(field));
+
+ /* During the creation of a PRIMARY KEY, the table is
+ X-locked, and we skip copying records that have been
+ marked for deletion. Therefore, externally stored
+ columns cannot possibly be freed between the time the
+ BLOB pointers are read (row_merge_read_clustered_index())
+ and dereferenced (below). */
+ if (mrec == NULL) {
+ field_data
+ = static_cast<byte*>(dfield_get_data(field));
+ field_len = dfield_get_len(field);
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ ut_a(memcmp(field_data + field_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+
+ data = btr_copy_externally_stored_field(
+ &len, field_data, zip_size, field_len, heap);
+ } else {
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets, zip_size, i, &len, heap);
+ }
+
+ /* Because we have locked the table, any records
+ written by incomplete transactions must have been
+ rolled back already. There must not be any incomplete
+ BLOB columns. */
+ ut_a(data);
+
+ dfield_set_data(field, data, len);
+ }
+}
+
+/** Convert a merge record to a typed data tuple. Note that externally
+stored fields are not copied to heap.
+@param[in,out] index index on the table
+@param[out]	dtuple	data tuple built from the merge record
+@param[in]	mtuple	merge record */
+static
+void
+row_merge_mtuple_to_dtuple(
+ dict_index_t* index,
+ dtuple_t* dtuple,
+ const mtuple_t* mtuple)
+{
+ ut_ad(!dict_index_is_ibuf(index));
+
+ memcpy(dtuple->fields, mtuple->fields,
+ dtuple->n_fields * sizeof *mtuple->fields);
+}
+
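+/** Insert sorted index entries into an index tree, reading them either
+from an in-memory sort buffer (row_buf) or from a merge file (fd, block).
+@return DB_SUCCESS or error code */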
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+ dict_index_t* index,
+ const dict_table_t* old_table,
+ const pfs_os_file_t& fd,
+ row_merge_block_t* block,
+ const row_merge_buf_t* row_buf,
+ BtrBulk* btr_bulk,
+ const ib_uint64_t table_total_rows,
+ double pct_progress,
+ double pct_cost,
+ row_merge_block_t* crypt_block,
+ ulint space,
+ ut_stage_alter_t* stage,
+ merge_file_t* blob_file)
+{
+ const byte* b;
+ mem_heap_t* heap;
+ mem_heap_t* tuple_heap;
+ dberr_t error = DB_SUCCESS;
+ ulint foffs = 0;
+ rec_offs* offsets;
+ mrec_buf_t* buf;
+ ulint n_rows = 0;
+ dtuple_t* dtuple;
+ ib_uint64_t inserted_rows = 0;
+ double curr_progress = 0;
+ dict_index_t* old_index = NULL;
+ const mrec_t* mrec = NULL;
+ mtr_t mtr;
+
+
+ DBUG_ENTER("row_merge_insert_index_tuples");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_spatial(index));
+
+ if (stage != NULL) {
+ stage->begin_phase_insert();
+ }
+
+ tuple_heap = mem_heap_create(1000);
+
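+	/* Allocate a single rec_offs array that is large enough for
+	any record of this index. */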
+ {
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *offsets));
+ rec_offs_set_n_alloc(offsets, i);
+ rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index));
+ }
+
+ if (row_buf != NULL) {
+ ut_ad(fd == OS_FILE_CLOSED);
+ ut_ad(block == NULL);
+ DBUG_EXECUTE_IF("row_merge_read_failure",
+ error = DB_CORRUPTION;
+ goto err_exit;);
+ buf = NULL;
+ b = NULL;
+ dtuple = dtuple_create(
+ heap, dict_index_get_n_fields(index));
+ dtuple_set_n_fields_cmp(
+ dtuple, dict_index_get_n_unique_in_tree(index));
+ } else {
+ b = block;
+ dtuple = NULL;
+
+ if (!row_merge_read(fd, foffs, block, crypt_block, space)) {
+ error = DB_CORRUPTION;
+ goto err_exit;
+ } else {
+ buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, sizeof *buf));
+ }
+ }
+
+ for (;;) {
+
+ if (stage != NULL) {
+ stage->inc();
+ }
+
+ if (row_buf != NULL) {
+ if (n_rows >= row_buf->n_tuples) {
+ break;
+ }
+
+ /* Convert merge tuple record from
+ row buffer to data tuple record */
+ row_merge_mtuple_to_dtuple(
+ index, dtuple, &row_buf->tuples[n_rows]);
+ n_rows++;
+ /* BLOB pointers must be copied from dtuple */
+ mrec = NULL;
+ } else {
+ b = row_merge_read_rec(block, buf, b, index,
+ fd, &foffs, &mrec, offsets,
+ crypt_block,
+ space);
+
+ if (UNIV_UNLIKELY(!b)) {
+ /* End of list, or I/O error */
+ if (mrec) {
+ error = DB_CORRUPTION;
+ }
+ break;
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec, index, offsets, tuple_heap);
+ }
+
+ old_index = dict_table_get_first_index(old_table);
+
+ if (dict_index_is_clust(index)
+ && dict_index_is_online_ddl(old_index)) {
+ error = row_log_table_get_error(old_index);
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ ut_ad(!dtuple_get_n_ext(dtuple) || index->is_primary());
+
+ if (!dtuple_get_n_ext(dtuple)) {
+ } else if (blob_file) {
+ error = row_merge_copy_blob_from_file(
+ dtuple, tuple_heap, blob_file);
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ } else {
+ /* Off-page columns can be fetched safely
+ when concurrent modifications to the table
+ are disabled. (Purge can process delete-marked
+ records, but row_merge_read_clustered_index()
+ would have skipped them.)
+
+ When concurrent modifications are enabled,
+ row_merge_read_clustered_index() will
+ only see rows from transactions that were
+ committed before the ALTER TABLE started
+ (REPEATABLE READ).
+
+ Any modifications after the
+ row_merge_read_clustered_index() scan
+ will go through row_log_table_apply(). */
+ row_merge_copy_blobs(
+ mrec, offsets,
+ old_table->space->zip_size(),
+ dtuple, tuple_heap);
+ }
+
+ ut_ad(dtuple_validate(dtuple));
+ error = btr_bulk->insert(dtuple);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ mem_heap_empty(tuple_heap);
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ inserted_rows++;
+		if (inserted_rows % 1000 == 0) {
+ /* Update progress for each 1000 rows */
+ curr_progress = (inserted_rows >= table_total_rows ||
+ table_total_rows <= 0) ?
+ pct_cost :
+ pct_cost * static_cast<double>(inserted_rows)
+ / static_cast<double>(table_total_rows);
+
+			/* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100);
+ }
+ }
+
+err_exit:
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Drop an index that was created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+static
+void
+row_merge_drop_index_dict(
+/*======================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ index_id_t index_id)/*!< in: index identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "indexid", index_id);
+ trx->op_info = "dropping index from dictionary";
+ error = que_eval_sql(info, sql, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_drop_index_dict failed with error "
+ << error;
+ }
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+static
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ table_id_t table_id)/*!< in: table identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID=:tableid AND\n"
+ " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
+
+	/* It is possible that table->n_ref_count > 1 when
+	locked=TRUE. In this case, all code that has an open
+	handle to the table should be waiting for the next statement
+	to execute, or for a meta-data lock.
+
+ A concurrent purge will be prevented by dict_sys.latch. */
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ trx->op_info = "dropping indexes";
+ error = que_eval_sql(info, sql, trx);
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ ib::error() << "row_merge_drop_indexes_dict failed with error "
+ << error;
+ /* fall through */
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+}
+
+/** Drop common internal tables if all fulltext indexes are dropped
+@param trx transaction
+@param table user table */
+static void row_merge_drop_fulltext_indexes(trx_t *trx, dict_table_t *table)
+{
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) ||
+ !table->fts ||
+ !ib_vector_is_empty(table->fts->indexes))
+ return;
+
+ for (const dict_index_t *index= dict_table_get_first_index(table);
+ index; index= dict_table_get_next_index(index))
+ if (index->type & DICT_FTS)
+ return;
+
+ fts_optimize_remove_table(table);
+ fts_drop_tables(trx, *table);
+ table->fts->~fts_t();
+ table->fts= nullptr;
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+}
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx dictionary transaction
+@param table table containing the indexes
+@param locked True if table is locked,
+ false - may need to do lazy drop
+@param alter_trx Alter table transaction */
+void
+row_merge_drop_indexes(
+ trx_t* trx,
+ dict_table_t* table,
+ bool locked,
+ const trx_t* alter_trx)
+{
+ dict_index_t* index;
+ dict_index_t* next_index;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
+
+ index = dict_table_get_first_index(table);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE);
+
+ /* the caller should have an open handle to the table */
+ ut_ad(table->get_ref_count() >= 1);
+
+	/* It is possible that table->n_ref_count > 1 when
+	locked=TRUE. In this case, all code that has an open
+	handle to the table should be waiting for the next statement
+	to execute, or for a meta-data lock.
+
+ A concurrent purge will be prevented by MDL. */
+
+ if (!locked && (table->get_ref_count() > 1
+ || table->has_lock_other_than(alter_trx))) {
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ ut_ad(!dict_index_is_clust(index));
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ continue;
+ case ONLINE_INDEX_COMPLETE:
+ if (index->is_committed()) {
+ /* Do nothing to already
+ published indexes. */
+ } else if (index->type & DICT_FTS) {
+ /* Drop a completed FULLTEXT
+ index, due to a timeout during
+ MDL upgrade for
+ commit_inplace_alter_table().
+ Because only concurrent reads
+ are allowed (and they are not
+ seeing this index yet) we
+ are safe to drop the index. */
+ dict_index_t* prev = UT_LIST_GET_PREV(
+ indexes, index);
+ /* At least there should be
+ the clustered index before
+ this one. */
+ ut_ad(prev);
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ row_merge_drop_index_dict(
+ trx, index->id);
+ /* We can remove a DICT_FTS
+ index from the cache, because
+ we do not allow ADD FULLTEXT INDEX
+ with LOCK=NONE. If we allowed that,
+ we should exclude FTS entries from
+ prebuilt->ins_node->entry_list
+ in ins_node_create_entry_list(). */
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!index->search_info->ref_count);
+#endif /* BTR_CUR_HASH_ADAPT */
+ dict_index_remove_from_cache(
+ table, index);
+ index = prev;
+ } else {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED);
+ index->type |= DICT_CORRUPT;
+ table->drop_aborted = TRUE;
+ goto drop_aborted;
+ }
+ continue;
+ case ONLINE_INDEX_CREATION:
+ index->lock.x_lock(SRW_LOCK_CALL);
+ ut_ad(!index->is_committed());
+ row_log_abort_sec(index);
+ drop_aborted:
+ index->lock.x_unlock();
+
+ DEBUG_SYNC_C("merge_drop_index_after_abort");
+ /* covered by dict_sys.latch */
+ MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
+ /* fall through */
+ case ONLINE_INDEX_ABORTED:
+ /* Drop the index tree from the
+ data dictionary and free it from
+ the tablespace, but keep the object
+ in the data dictionary cache. */
+ row_merge_drop_index_dict(trx, index->id);
+ index->lock.x_lock(SRW_LOCK_CALL);
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED_DROPPED);
+ index->lock.x_unlock();
+ table->drop_aborted = TRUE;
+ continue;
+ }
+ ut_error;
+ }
+
+ row_merge_drop_fulltext_indexes(trx, table);
+ return;
+ }
+
+ row_merge_drop_indexes_dict(trx, table->id);
+
+ /* Invalidate all row_prebuilt_t::ins_graph that are referring
+ to this table. That is, force row_get_prebuilt_insert_row() to
+ rebuild prebuilt->ins_node->entry_list). */
+ if (table->def_trx_id < trx->id) {
+ table->def_trx_id = trx->id;
+ } else {
+ ut_ad(table->def_trx_id == trx->id || table->name.part());
+ }
+
+ next_index = dict_table_get_next_index(index);
+
+ while ((index = next_index) != NULL) {
+ /* read the next pointer before freeing the index */
+ next_index = dict_table_get_next_index(index);
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!index->is_committed()) {
+ /* If it is FTS index, drop from table->fts
+ and also drop its auxiliary tables */
+ if (index->type & DICT_FTS) {
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ }
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ /* This state should only be possible
+ when prepare_inplace_alter_table() fails
+ after invoking row_merge_create_index().
+ In inplace_alter_table(),
+ row_merge_build_indexes()
+ should never leave the index in this state.
+ It would invoke row_log_abort_sec() on
+ failure. */
+ case ONLINE_INDEX_COMPLETE:
+ /* In these cases, we are able to drop
+ the index straight. The DROP INDEX was
+ never deferred. */
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* covered by dict_sys.latch */
+ MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
+ }
+
+ dict_index_remove_from_cache(table, index);
+ }
+ }
+
+ row_merge_drop_fulltext_indexes(trx, table);
+ table->drop_aborted = FALSE;
+ ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+}
+
+/** Drop fulltext indexes */
+static ibool row_merge_drop_fts(void *node, void *trx)
+{
+ auto s= static_cast<sel_node_t*>(node);
+
+ const dfield_t *table_id= que_node_get_val(s->select_list);
+ ut_ad(table_id->type.mtype == DATA_BINARY);
+ node= que_node_get_next(s->select_list);
+ ut_ad(!que_node_get_next(node));
+ const dfield_t *index_id= que_node_get_val(node);
+ ut_ad(index_id->type.mtype == DATA_BINARY);
+
+ static const char sql[]=
+ "PROCEDURE DROP_TABLES_PROC () IS\n"
+ "tid CHAR;\n"
+ "iid CHAR;\n"
+
+ "DECLARE CURSOR cur_tab IS\n"
+ "SELECT ID FROM SYS_TABLES\n"
+ "WHERE INSTR(NAME,:name)+45=LENGTH(NAME)"
+ " AND INSTR('123456',SUBSTR(NAME,LENGTH(NAME)-1,1))>0"
+ " FOR UPDATE;\n"
+
+ "DECLARE CURSOR cur_idx IS\n"
+ "SELECT ID FROM SYS_INDEXES\n"
+ "WHERE TABLE_ID = tid FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "OPEN cur_tab;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH cur_tab INTO tid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " OPEN cur_idx;\n"
+ " WHILE 1 = 1 LOOP\n"
+ " FETCH cur_idx INTO iid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF cur_idx;\n"
+ " END LOOP;\n"
+ " CLOSE cur_idx;\n"
+ " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n"
+ " DELETE FROM SYS_TABLES WHERE CURRENT OF cur_tab;\n"
+ "END LOOP;\n"
+ "CLOSE cur_tab;\n"
+ "END;\n";
+
+ if (table_id->len == 8 && index_id->len == 8)
+ {
+ char buf[sizeof "/FTS_0000000000000000_0000000000000000_INDEX_"];
+ snprintf(buf, sizeof buf, "/FTS_%016llx_%016llx_INDEX_",
+ static_cast<ulonglong>
+ (mach_read_from_8(static_cast<const byte*>(table_id->data))),
+ static_cast<ulonglong>
+ (mach_read_from_8(static_cast<const byte*>(index_id->data))));
+ auto pinfo= pars_info_create();
+ pars_info_add_str_literal(pinfo, "name", buf);
+ que_eval_sql(pinfo, sql, static_cast<trx_t*>(trx));
+ }
+
+ return true;
+}
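+
+/* For example, for a table id of 0x10 and an index id of 0x2a,
+row_merge_drop_fts() binds :name to the 45-character string
+"/FTS_0000000000000010_000000000000002a_INDEX_", so the procedure deletes
+the SYS_TABLES rows whose names end with that string followed by a single
+digit '1'..'6' (the fulltext auxiliary index tables), together with their
+SYS_INDEXES, SYS_FIELDS and SYS_COLUMNS rows. */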
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes()
+{
+ static_assert(DICT_FTS == 32, "compatibility");
+
+ static const char sql[] =
+ "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE FUNCTION drop_fts;\n"
+
+ "DECLARE CURSOR fts_cur IS\n"
+ " SELECT TABLE_ID,ID FROM SYS_INDEXES\n"
+ " WHERE TYPE=32"
+ " AND SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ " FOR UPDATE;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN fts_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH fts_cur INTO drop_fts();\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE fts_cur;\n"
+
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+ "END;\n";
+
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
+ trx_t* trx = trx_create();
+ trx_start_for_ddl(trx);
+ trx->op_info = "dropping partially created indexes";
+ dberr_t error = lock_sys_tables(trx);
+
+ row_mysql_lock_data_dictionary(trx);
+ /* Ensure that this transaction will be rolled back and locks
+ will be released, if the server gets killed before the commit
+ gets written to the redo log. */
+ trx->dict_operation = true;
+
+ trx->op_info = "dropping indexes";
+
+ pars_info_t* pinfo = pars_info_create();
+ pars_info_bind_function(pinfo, "drop_fts", row_merge_drop_fts, trx);
+ if (error == DB_SUCCESS) {
+ error = que_eval_sql(pinfo, sql, trx);
+ }
+
+ if (error) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_drop_temp_indexes(): " << error;
+ }
+
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+}
+
+
+/** Create temporary merge files in the given parameter path, and if
+UNIV_PFS_IO is defined, register the file descriptor with Performance Schema.
+@param[in] path location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+ const char* path)
+{
+ if (!path) {
+ path = mysql_tmpdir;
+ }
+#ifdef UNIV_PFS_IO
+	/* This temp file open does not go through the normal
+	file APIs; add instrumentation to register it with
+	performance schema */
+ struct PSI_file_locker* locker;
+ PSI_file_locker_state state;
+ static const char label[] = "/Innodb Merge Temp File";
+ char* name = static_cast<char*>(
+ ut_malloc_nokey(strlen(path) + sizeof label));
+ strcpy(name, path);
+ strcat(name, label);
+
+ register_pfs_file_open_begin(
+ &state, locker, innodb_temp_file_key,
+ PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__);
+
+#endif
+ DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN);
+ char filename[FN_REFLEN];
+ File f = create_temp_file(filename, path, "ib",
+ O_BINARY | O_SEQUENTIAL,
+ MYF(MY_WME | MY_TEMPORARY));
+ pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f);
+
+#ifdef UNIV_PFS_IO
+ register_pfs_file_open_end(locker, fd,
+ (fd == OS_FILE_CLOSED)?NULL:&fd);
+ ut_free(name);
+#endif
+
+ if (fd == OS_FILE_CLOSED) {
+ ib::error() << "Cannot create temporary merge file";
+ }
+ return(fd);
+}
+
+
+/** Create a merge file in the given location.
+@param[out] merge_file merge file structure
+@param[in] path location for creating temporary file, or NULL
+@return file descriptor, or OS_FILE_CLOSED on error */
+pfs_os_file_t
+row_merge_file_create(
+ merge_file_t* merge_file,
+ const char* path)
+{
+ merge_file->fd = row_merge_file_create_low(path);
+ merge_file->offset = 0;
+ merge_file->n_rec = 0;
+
+ if (merge_file->fd != OS_FILE_CLOSED) {
+ if (srv_disable_sort_file_cache) {
+ os_file_set_nocache(merge_file->fd,
+ "row0merge.cc", "sort");
+ }
+ }
+ return(merge_file->fd);
+}
+
+/*********************************************************************//**
+Destroy a merge file, and de-register it from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ const pfs_os_file_t& fd) /*!< in: merge file descriptor */
+{
+ if (fd != OS_FILE_CLOSED) {
+ int res = mysql_file_close(IF_WIN(my_win_handle2File((os_file_t)fd), fd),
+ MYF(MY_WME));
+ ut_a(res != -1);
+ }
+}
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (merge_file->fd != OS_FILE_CLOSED) {
+ row_merge_file_destroy_low(merge_file->fd);
+ merge_file->fd = OS_FILE_CLOSED;
+ }
+}
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+{
+ dberr_t err = DB_SUCCESS;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_index[] =
+ "PROCEDURE RENAME_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+ "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+ "END;\n";
+
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+
+ trx->op_info = "renaming index to add";
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_ull_literal(info, "indexid", index_id);
+
+ err = que_eval_sql(info, rename_index, trx);
+
+ if (err != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ib::error() << "row_merge_rename_index_to_add failed with"
+ " error " << err;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Create the index and load in to the dictionary.
+@param[in,out] table the index is on this table
+@param[in] index_def the index definition
+@param[in] add_v new virtual columns added along with add
+ index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+ dict_table_t* table,
+ const index_def_t* index_def,
+ const dict_add_v_col_t* add_v)
+{
+ dict_index_t* index;
+ ulint n_fields = index_def->n_fields;
+ ulint i;
+ ulint n_add_vcol = 0;
+
+ DBUG_ENTER("row_merge_create_index");
+
+ ut_ad(!srv_read_only_mode);
+
+	/* Create the index prototype using the passed-in definition; this
+	is not a persistent operation. We pass 0 as the space id, and
+	determine at a lower level the space id where to store the table. */
+
+ index = dict_mem_index_create(table, index_def->name,
+ index_def->ind_type, n_fields);
+ index->set_committed(index_def->rebuild);
+
+ for (i = 0; i < n_fields; i++) {
+ const char* name;
+ index_field_t* ifield = &index_def->fields[i];
+
+ if (ifield->is_v_col) {
+ if (ifield->col_no >= table->n_v_def) {
+ ut_ad(ifield->col_no < table->n_v_def
+ + add_v->n_v_col);
+ ut_ad(ifield->col_no >= table->n_v_def);
+ name = add_v->v_col_name[
+ ifield->col_no - table->n_v_def];
+ n_add_vcol++;
+ } else {
+ name = dict_table_get_v_col_name(
+ table, ifield->col_no);
+ }
+ } else {
+ name = dict_table_get_col_name(table, ifield->col_no);
+ }
+
+ dict_mem_index_add_field(index, name, ifield->prefix_len,
+ ifield->descending);
+ }
+
+ if (n_add_vcol) {
+ index->assign_new_v_col(n_add_vcol);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index. */
+bool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+{
+ if (!index->is_primary()
+ && dict_index_is_online_ddl(index)) {
+ /* Indexes that are being created are not useable. */
+ return(false);
+ }
+
+ return(!index->is_corrupted()
+ && (index->table->is_temporary() || index->table->no_rollback()
+ || index->trx_id == 0
+ || !trx->read_view.is_open()
+ || trx->read_view.changes_visible(index->trx_id)));
+}
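+
+/* For example, an index whose creating transaction committed before the
+current read view was opened has a visible index->trx_id and is therefore
+usable, whereas a secondary index that is still being built online is not. */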
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in] trx transaction
+@param[in] old_table table where rows are read from
+@param[in] new_table table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in] online true if creating indexes online
+@param[in] indexes indexes to be created
+@param[in] key_numbers MySQL key numbers
+@param[in] n_indexes size of indexes[]
+@param[in,out] table MySQL table, for reporting erroneous key value
+if applicable
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] col_map mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in] add_autoinc number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out] sequence autoinc sequence
+@param[in] skip_pk_sort whether the new PRIMARY KEY will follow
+existing order
+@param[in,out] stage performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in] add_v new virtual columns added along with indexes
+@param[in] eval_table mysql table used to evaluate virtual column
+ value, see innobase_get_computed_value().
+@param[in] allow_not_null allow the conversion from null to not-null
+@param[in] col_collate columns whose collations changed, or nullptr
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+ trx_t* trx,
+ dict_table_t* old_table,
+ dict_table_t* new_table,
+ bool online,
+ dict_index_t** indexes,
+ const ulint* key_numbers,
+ ulint n_indexes,
+ struct TABLE* table,
+ const dtuple_t* defaults,
+ const ulint* col_map,
+ ulint add_autoinc,
+ ib_sequence_t& sequence,
+ bool skip_pk_sort,
+ ut_stage_alter_t* stage,
+ const dict_add_v_col_t* add_v,
+ struct TABLE* eval_table,
+ bool allow_not_null,
+ const col_collations* col_collate)
+{
+ merge_file_t* merge_files;
+ row_merge_block_t* block;
+ ut_new_pfx_t block_pfx;
+ size_t block_size;
+ ut_new_pfx_t crypt_pfx;
+ row_merge_block_t* crypt_block = NULL;
+ ulint i;
+ ulint j;
+ dberr_t error;
+ pfs_os_file_t tmpfd = OS_FILE_CLOSED;
+ dict_index_t* fts_sort_idx = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ bool fts_psort_initiated = false;
+
+ double total_static_cost = 0;
+ double total_dynamic_cost = 0;
+ ulint total_index_blocks = 0;
+ double pct_cost=0;
+ double pct_progress=0;
+
+ DBUG_ENTER("row_merge_build_indexes");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!defaults || col_map);
+
+ stage->begin_phase_read_pk(skip_pk_sort && new_table != old_table
+ ? n_indexes - 1
+ : n_indexes);
+
+ /* Allocate memory for merge file data structure and initialize
+ fields */
+
+ ut_allocator<row_merge_block_t> alloc(mem_key_row_merge_sort);
+
+ /* This will allocate "3 * srv_sort_buf_size" elements of type
+ row_merge_block_t. The latter is defined as byte. */
+ block_size = 3 * srv_sort_buf_size;
+ block = alloc.allocate_large(block_size, &block_pfx);
+
+ if (block == NULL) {
+ DBUG_RETURN(DB_OUT_OF_MEMORY);
+ }
+
+ crypt_pfx.m_size = 0; /* silence bogus -Wmaybe-uninitialized */
+ TRASH_ALLOC(&crypt_pfx, sizeof crypt_pfx);
+
+ if (srv_encrypt_log) {
+ crypt_block = static_cast<row_merge_block_t*>(
+ alloc.allocate_large(block_size,
+ &crypt_pfx));
+
+ if (crypt_block == NULL) {
+ DBUG_RETURN(DB_OUT_OF_MEMORY);
+ }
+ }
+
+ trx_start_if_not_started_xa(trx, true);
+ ulint n_merge_files = 0;
+
+ for (ulint i = 0; i < n_indexes; i++)
+ {
+ if (!dict_index_is_spatial(indexes[i])) {
+ n_merge_files++;
+ }
+ }
+
+ merge_files = static_cast<merge_file_t*>(
+ ut_malloc_nokey(n_merge_files * sizeof *merge_files));
+
+ /* Initialize all the merge file descriptors, so that we
+ don't call row_merge_file_destroy() on uninitialized
+ merge file descriptor */
+
+ for (i = 0; i < n_merge_files; i++) {
+ merge_files[i].fd = OS_FILE_CLOSED;
+ merge_files[i].offset = 0;
+ merge_files[i].n_rec = 0;
+ }
+
+ total_static_cost = COST_BUILD_INDEX_STATIC
+ * static_cast<double>(n_indexes) + COST_READ_CLUSTERED_INDEX;
+ total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC
+ * static_cast<double>(n_indexes);
+ for (i = 0; i < n_indexes; i++) {
+ if (indexes[i]->type & DICT_FTS) {
+ ibool opt_doc_id_size = FALSE;
+
+			/* To build an FTS index, we need to extract each
+			document's words, the Doc ID, and the word positions,
+			so we build a "fts sort index" indexing
+			on those three 'fields' */
+ fts_sort_idx = row_merge_create_fts_sort_index(
+ indexes[i], old_table, &opt_doc_id_size);
+
+ row_merge_dup_t* dup
+ = static_cast<row_merge_dup_t*>(
+ ut_malloc_nokey(sizeof *dup));
+ dup->index = fts_sort_idx;
+ dup->table = table;
+ dup->col_map = col_map;
+ dup->n_dup = 0;
+
+			/* This can fail e.g. if temporary files can't be
+			created */
+ if (!row_fts_psort_info_init(
+ trx, dup, new_table, opt_doc_id_size,
+ old_table->space->zip_size(),
+ &psort_info, &merge_info)) {
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ /* We need to ensure that we free the resources
+ allocated */
+ fts_psort_initiated = true;
+ }
+ }
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : Start reading"
+ " clustered index of the table"
+ " and create temporary files");
+ }
+
+ pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost);
+
+ /* Do not continue if we can't encrypt table pages */
+ if (!old_table->is_readable() ||
+ !new_table->is_readable()) {
+ error = DB_DECRYPTION_FAILED;
+ ib_push_warning(trx->mysql_thd, DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ !old_table->is_readable() ? old_table->name.m_name :
+ new_table->name.m_name);
+ goto func_exit;
+ }
+
+ /* Read clustered index of the table and create files for
+ secondary index entries for merge sort */
+ error = row_merge_read_clustered_index(
+ trx, table, old_table, new_table, online, indexes,
+ fts_sort_idx, psort_info, merge_files, key_numbers,
+ n_indexes, defaults, add_v, col_map, add_autoinc,
+ sequence, block, skip_pk_sort, &tmpfd, stage,
+ pct_cost, crypt_block, eval_table, allow_not_null,
+ col_collate);
+
+ stage->end_phase_read_pk();
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL : End of reading "
+ "clustered index of the table"
+ " and create temporary files");
+ }
+
+ for (i = 0; i < n_merge_files; i++) {
+ total_index_blocks += merge_files[i].offset;
+ }
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ DEBUG_SYNC_C("row_merge_after_scan");
+
+ /* Now we have files containing index entries ready for
+ sorting and inserting. */
+
+ for (ulint k = 0, i = 0; i < n_indexes; i++) {
+ dict_index_t* sort_idx = indexes[i];
+
+ if (dict_index_is_spatial(sort_idx)) {
+ continue;
+ }
+
+ if (indexes[i]->type & DICT_FTS) {
+
+ sort_idx = fts_sort_idx;
+
+ if (FTS_PLL_MERGE) {
+ row_fts_start_parallel_merge(merge_info);
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+ merge_info[j].task->wait();
+ delete merge_info[j].task;
+ }
+ } else {
+ /* This cannot report duplicates; an
+ assertion would fail in that case. */
+ error = row_fts_merge_insert(
+ sort_idx, new_table,
+ psort_info, 0);
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
+#endif
+ } else if (merge_files[k].fd != OS_FILE_CLOSED) {
+ char buf[NAME_LEN + 1];
+ row_merge_dup_t dup = {
+ sort_idx, table, col_map, 0};
+
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost
+ * static_cast<double>(merge_files[k].offset)
+ / static_cast<double>(total_index_blocks)))
+ / (total_static_cost + total_dynamic_cost)
+ * PCT_COST_MERGESORT_INDEX * 100;
+ char* bufend = innobase_convert_name(
+ buf, sizeof buf,
+ indexes[i]->name,
+ strlen(indexes[i]->name),
+ trx->mysql_thd);
+ buf[bufend - buf]='\0';
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL :"
+ " Start merge-sorting"
+ " index %s"
+ " (" ULINTPF
+ " / " ULINTPF "),"
+ " estimated cost :"
+ " %2.4f",
+ buf, i + 1, n_indexes,
+ pct_cost);
+ }
+
+ error = row_merge_sort(
+ trx, &dup, &merge_files[k],
+ block, &tmpfd, true,
+ pct_progress, pct_cost,
+ crypt_block, new_table->space_id,
+ stage);
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information("InnoDB: Online DDL :"
+ " End of "
+ " merge-sorting index %s"
+ " (" ULINTPF
+ " / " ULINTPF ")",
+ buf, i + 1, n_indexes);
+ }
+
+ if (error == DB_SUCCESS) {
+ BtrBulk btr_bulk(sort_idx, trx);
+
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost
+ * static_cast<double>(
+ merge_files[k].offset)
+ / static_cast<double>(
+ total_index_blocks)))
+ / (total_static_cost
+ + total_dynamic_cost)
+ * PCT_COST_INSERT_INDEX * 100;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : Start "
+ "building index %s"
+ " (" ULINTPF
+ " / " ULINTPF "), estimated "
+ "cost : %2.4f", buf, i + 1,
+ n_indexes, pct_cost);
+ }
+
+ error = row_merge_insert_index_tuples(
+ sort_idx, old_table,
+ merge_files[k].fd, block, NULL,
+ &btr_bulk,
+ merge_files[k].n_rec, pct_progress, pct_cost,
+ crypt_block, new_table->space_id,
+ stage);
+
+ error = btr_bulk.finish(error);
+
+ pct_progress += pct_cost;
+
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : "
+ "End of building index %s"
+ " (" ULINTPF " / " ULINTPF ")",
+ buf, i + 1, n_indexes);
+ }
+ }
+ }
+
+ /* Close the temporary file to free up space. */
+ row_merge_file_destroy(&merge_files[k++]);
+
+ if (indexes[i]->type & DICT_FTS) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ } else if (old_table != new_table) {
+ ut_ad(!sort_idx->online_log);
+ ut_ad(sort_idx->online_status
+ == ONLINE_INDEX_COMPLETE);
+ }
+
+ if (old_table != new_table
+ || (indexes[i]->type & (DICT_FTS | DICT_SPATIAL))
+ || error != DB_SUCCESS || !online) {
+ /* Do not apply any online log. */
+ } else {
+ if (global_system_variables.log_warnings > 2) {
+ sql_print_information(
+ "InnoDB: Online DDL : Applying"
+ " log to index");
+ }
+
+ DEBUG_SYNC_C("row_log_apply_before");
+ error = row_log_apply(trx, sort_idx, table, stage);
+ DEBUG_SYNC_C("row_log_apply_after");
+ }
+
+ if (error != DB_SUCCESS) {
+ trx->error_key_num = key_numbers[i];
+ goto func_exit;
+ }
+
+ if (indexes[i]->type & DICT_FTS
+ && UNIV_UNLIKELY(fts_enable_diag_print)) {
+ ib::info() << "Finished building full-text index "
+ << indexes[i]->name;
+ }
+ }
+
+func_exit:
+
+ DBUG_EXECUTE_IF(
+ "ib_build_indexes_too_many_concurrent_trxs",
+ error = DB_TOO_MANY_CONCURRENT_TRXS;
+ trx->error_state = error;);
+
+ if (fts_psort_initiated) {
+ /* Clean up FTS psort related resource */
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ }
+
+ row_merge_file_destroy_low(tmpfd);
+
+ for (i = 0; i < n_merge_files; i++) {
+ row_merge_file_destroy(&merge_files[i]);
+ }
+
+ if (fts_sort_idx) {
+ dict_mem_index_free(fts_sort_idx);
+ }
+
+ ut_free(merge_files);
+
+ alloc.deallocate_large(block, &block_pfx);
+
+ if (crypt_block) {
+ alloc.deallocate_large(crypt_block, &crypt_pfx);
+ }
+
+ DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ if (online && old_table == new_table && error != DB_SUCCESS) {
+ /* On error, flag all online secondary index creation
+ as aborted. */
+ for (i = 0; i < n_indexes; i++) {
+ ut_ad(!(indexes[i]->type & DICT_FTS));
+ ut_ad(!indexes[i]->is_committed());
+ ut_ad(!dict_index_is_clust(indexes[i]));
+
+ /* Completed indexes should be dropped as
+ well, and indexes whose creation was aborted
+ should be dropped from the persistent
+ storage. However, at this point we can only
+ set some flags in the not-yet-published
+ indexes. These indexes will be dropped later
+ in row_merge_drop_indexes(), called by
+ rollback_inplace_alter_table(). */
+
+ switch (dict_index_get_online_status(indexes[i])) {
+ case ONLINE_INDEX_COMPLETE:
+ break;
+ case ONLINE_INDEX_CREATION:
+ indexes[i]->lock.x_lock(SRW_LOCK_CALL);
+ row_log_abort_sec(indexes[i]);
+ indexes[i]->type |= DICT_CORRUPT;
+ indexes[i]->lock.x_unlock();
+ new_table->drop_aborted = TRUE;
+ /* fall through */
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ case ONLINE_INDEX_ABORTED:
+ MONITOR_ATOMIC_INC(
+ MONITOR_BACKGROUND_DROP_INDEX);
+ }
+ }
+
+ dict_index_t *clust_index= new_table->indexes.start;
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+ ut_ad(!clust_index->online_log ||
+ clust_index->online_log_is_dummy());
+ clust_index->online_log= nullptr;
+ clust_index->lock.x_unlock();
+ }
+
+ DBUG_RETURN(error);
+}
+
+dberr_t row_merge_bulk_t::alloc_block()
+{
+ if (m_block)
+ return DB_SUCCESS;
+ m_block= m_alloc.allocate_large_dontdump(
+ 3 * srv_sort_buf_size, &m_block_pfx);
+ if (m_block == nullptr)
+ return DB_OUT_OF_MEMORY;
+
+ m_crypt_pfx.m_size= 0;
+ TRASH_ALLOC(&m_crypt_pfx, sizeof m_crypt_pfx);
+ if (srv_encrypt_log)
+ {
+ m_crypt_block= static_cast<row_merge_block_t*>(
+ m_alloc.allocate_large(3 * srv_sort_buf_size, &m_crypt_pfx));
+ if (!m_crypt_block)
+ return DB_OUT_OF_MEMORY;
+ }
+ return DB_SUCCESS;
+}
+
+row_merge_bulk_t::row_merge_bulk_t(dict_table_t *table)
+{
+ ulint n_index= 0;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+ n_index++;
+ }
+
+ m_merge_buf= static_cast<row_merge_buf_t*>(
+ ut_zalloc_nokey(n_index * sizeof *m_merge_buf));
+
+ ulint i= 0;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+
+ mem_heap_t *heap= mem_heap_create(100);
+ row_merge_buf_create_low(&m_merge_buf[i], heap, index);
+ i++;
+ }
+
+ m_tmpfd= OS_FILE_CLOSED;
+ m_blob_file.fd= OS_FILE_CLOSED;
+ m_blob_file.offset= 0;
+ m_blob_file.n_rec= 0;
+}
+
+row_merge_bulk_t::~row_merge_bulk_t()
+{
+ ulint i= 0;
+ dict_table_t *table= m_merge_buf[0].index->table;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+ row_merge_buf_free(&m_merge_buf[i]);
+ if (m_merge_files)
+ row_merge_file_destroy(&m_merge_files[i]);
+ i++;
+ }
+
+ row_merge_file_destroy_low(m_tmpfd);
+
+ row_merge_file_destroy(&m_blob_file);
+
+ ut_free(m_merge_buf);
+
+ ut_free(m_merge_files);
+
+ if (m_block)
+ m_alloc.deallocate_large(m_block, &m_block_pfx);
+
+ if (m_crypt_block)
+ m_alloc.deallocate_large(m_crypt_block, &m_crypt_pfx);
+}
+
+void row_merge_bulk_t::init_tmp_file()
+{
+ if (m_merge_files)
+ return;
+
+ ulint n_index= 0;
+ dict_table_t *table= m_merge_buf[0].index->table;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+ n_index++;
+ }
+
+ m_merge_files= static_cast<merge_file_t*>(
+ ut_malloc_nokey(n_index * sizeof *m_merge_files));
+
+ for (ulint i= 0; i < n_index; i++)
+ {
+ m_merge_files[i].fd= OS_FILE_CLOSED;
+ m_merge_files[i].offset= 0;
+ m_merge_files[i].n_rec= 0;
+ }
+}
+
+void row_merge_bulk_t::clean_bulk_buffer(ulint index_no)
+{
+ mem_heap_empty(m_merge_buf[index_no].heap);
+ m_merge_buf[index_no].total_size = m_merge_buf[index_no].n_tuples = 0;
+}
+
+bool row_merge_bulk_t::create_tmp_file(ulint index_no)
+{
+ return row_merge_file_create_if_needed(
+ &m_merge_files[index_no], &m_tmpfd,
+ m_merge_buf[index_no].n_tuples, NULL);
+}
+
+dberr_t row_merge_bulk_t::write_to_tmp_file(ulint index_no)
+{
+ if (!create_tmp_file(index_no))
+ return DB_OUT_OF_MEMORY;
+ merge_file_t *file= &m_merge_files[index_no];
+ row_merge_buf_t *buf= &m_merge_buf[index_no];
+
+ alloc_block();
+
+ if (dberr_t err= row_merge_buf_write(buf,
+#ifndef DBUG_OFF
+ file,
+#endif
+ m_block,
+ index_no == 0 ? &m_blob_file : nullptr))
+ return err;
+
+ if (!row_merge_write(file->fd, file->offset++,
+ m_block, m_crypt_block,
+ buf->index->table->space->id))
+ return DB_TEMP_FILE_WRITE_FAIL;
+ MEM_UNDEFINED(&m_block[0], srv_sort_buf_size);
+ return DB_SUCCESS;
+}
+
+dberr_t row_merge_bulk_t::bulk_insert_buffered(const dtuple_t &row,
+ const dict_index_t &ind,
+ trx_t *trx)
+{
+ dberr_t err= DB_SUCCESS;
+ ulint i= 0;
+ mem_heap_t *large_tuple_heap= nullptr;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(ind.table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+
+ if (index != &ind)
+ {
+ i++;
+ continue;
+ }
+ row_merge_buf_t *buf= &m_merge_buf[i];
+add_to_buf:
+ if (row_merge_bulk_buf_add(buf, *ind.table, row))
+ {
+ i++;
+ goto func_exit;
+ }
+
+ if (buf->n_tuples == 0)
+ {
+ /* Tuple data size is greater than srv_sort_buf_size */
+ dtuple_t *big_tuple= row_merge_buf_large_tuple(
+ row, &m_blob_file, &large_tuple_heap);
+ if (row_merge_bulk_buf_add(buf, *ind.table, *big_tuple))
+ {
+ i++;
+ goto func_exit;
+ }
+ }
+
+ if (index->is_unique())
+ {
+ row_merge_dup_t dup{index, nullptr, nullptr, 0};
+ row_merge_buf_sort(buf, &dup);
+ if (dup.n_dup)
+ {
+ trx->error_info= index;
+ err= DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ else
+ row_merge_buf_sort(buf, NULL);
+ init_tmp_file();
+ merge_file_t *file= &m_merge_files[i];
+ file->n_rec+= buf->n_tuples;
+ err= write_to_tmp_file(i);
+ if (err != DB_SUCCESS)
+ {
+ trx->error_info= index;
+ goto func_exit;
+ }
+ clean_bulk_buffer(i);
+ buf= &m_merge_buf[i];
+ goto add_to_buf;
+ }
+
+func_exit:
+ if (large_tuple_heap)
+ mem_heap_free(large_tuple_heap);
+ return err;
+}
+
+dberr_t row_merge_bulk_t::write_to_index(ulint index_no, trx_t *trx)
+{
+ dberr_t err= DB_SUCCESS;
+ row_merge_buf_t buf= m_merge_buf[index_no];
+ merge_file_t *file= m_merge_files ?
+ &m_merge_files[index_no] : nullptr;
+ dict_index_t *index= buf.index;
+ dict_table_t *table= index->table;
+ BtrBulk btr_bulk(index, trx);
+ row_merge_dup_t dup = {index, nullptr, nullptr, 0};
+
+ if (buf.n_tuples)
+ {
+ if (dict_index_is_unique(index))
+ {
+ row_merge_buf_sort(&buf, &dup);
+ if (dup.n_dup)
+ {
+ err= DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ else row_merge_buf_sort(&buf, NULL);
+ if (file && file->fd != OS_FILE_CLOSED)
+ {
+ file->n_rec+= buf.n_tuples;
+ err= write_to_tmp_file(index_no);
+ if (err!= DB_SUCCESS)
+ goto func_exit;
+ }
+ else
+ {
+      /* The data fits entirely in the merge buffer. */
+ err= row_merge_insert_index_tuples(
+ index, table, OS_FILE_CLOSED, nullptr,
+ &buf, &btr_bulk, 0, 0, 0, nullptr, table->space_id, nullptr,
+ m_blob_file.fd == OS_FILE_CLOSED ? nullptr : &m_blob_file);
+ goto func_exit;
+ }
+ }
+
+ err= row_merge_sort(trx, &dup, file,
+ m_block, &m_tmpfd, true, 0, 0,
+ m_crypt_block, table->space_id, nullptr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ err= row_merge_insert_index_tuples(
+ index, table, file->fd, m_block, nullptr,
+ &btr_bulk, 0, 0, 0, m_crypt_block, table->space_id,
+ nullptr, &m_blob_file);
+
+func_exit:
+ if (err != DB_SUCCESS)
+ trx->error_info= index;
+ else if (index->is_primary() && table->persistent_autoinc)
+ btr_write_autoinc(index, table->autoinc - 1);
+ err= btr_bulk.finish(err);
+ return err;
+}
+
+dberr_t row_merge_bulk_t::write_to_table(dict_table_t *table, trx_t *trx)
+{
+ ulint i= 0;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes);
+ index; index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (!index->is_btree())
+ continue;
+
+ dberr_t err= write_to_index(i, trx);
+ if (err != DB_SUCCESS)
+ return err;
+ i++;
+ }
+
+ return DB_SUCCESS;
+}
+
+dberr_t trx_mod_table_time_t::write_bulk(dict_table_t *table, trx_t *trx)
+{
+ if (!bulk_store)
+ return DB_SUCCESS;
+ dberr_t err= bulk_store->write_to_table(table, trx);
+ delete bulk_store;
+ bulk_store= nullptr;
+ return err;
+}
+
+dberr_t trx_t::bulk_insert_apply_low()
+{
+ ut_ad(bulk_insert);
+ ut_ad(!check_unique_secondary);
+ ut_ad(!check_foreigns);
+ dberr_t err;
+ for (auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ if ((err= t.second.write_bulk(t.first, this)) != DB_SUCCESS)
+ goto bulk_rollback;
+ return DB_SUCCESS;
+bulk_rollback:
+ undo_no_t low_limit= UINT64_MAX;
+ for (auto& t : mod_tables)
+ {
+ if (t.second.is_bulk_insert())
+ {
+ if (t.second.get_first() < low_limit)
+ low_limit= t.second.get_first();
+ delete t.second.bulk_store;
+ t.second.bulk_store= nullptr;
+ }
+ }
+ trx_savept_t bulk_save{low_limit};
+ rollback(&bulk_save);
+ return err;
+}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
new file mode 100644
index 00000000..c5ee3be7
--- /dev/null
+++ b/storage/innobase/row/row0mysql.cc
@@ -0,0 +1,2916 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.cc
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <gstream.h>
+#include <spatial.h>
+
+#include "row0mysql.h"
+#include "buf0flu.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fsp0file.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0import.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "log.h"
+
+#include <algorithm>
+#include <vector>
+#include <thread>
+
+
+/** Delay an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static void row_mysql_delay_if_needed()
+{
+ const auto delay= srv_dml_needed_delay;
+ if (UNIV_UNLIKELY(delay != 0))
+ {
+ /* Adjust for purge_coordinator_state::refresh() */
+ log_sys.latch.rd_lock(SRW_LOCK_CALL);
+ const lsn_t last= log_sys.last_checkpoint_lsn,
+ max_age= log_sys.max_checkpoint_age;
+ log_sys.latch.rd_unlock();
+ const lsn_t lsn= log_sys.get_lsn();
+ if ((lsn - last) / 4 >= max_age / 5)
+ buf_flush_ahead(last + max_age / 5, false);
+ purge_sys.wake_if_not_active();
+ std::this_thread::sleep_for(std::chrono::microseconds(delay));
+ }
+}
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ DBUG_ENTER("row_mysql_prebuilt_free_blob_heap");
+
+ DBUG_PRINT("row_mysql_prebuilt_free_blob_heap",
+ ("blob_heap freeing: %p", prebuilt->blob_heap));
+
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
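+
+/* For example, row_mysql_store_true_var_len(dest, 300, 2) writes the
+little-endian bytes 0x2C 0x01 into dest[0..1] and returns dest + 2, while
+row_mysql_store_true_var_len(dest, 7, 1) writes the single byte 0x07 and
+returns dest + 1. */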
+
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen) /*!< in: storage length of len: either 1
+ or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
+
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+ /* MySQL might assume the field is set to zero except the length and
+ the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ memcpy(dest + col_len - 8, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len) /*!< in: BLOB reference length
+ (not BLOB length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
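+
+/* For example, with col_len=12 a reference written by
+row_mysql_store_blob_ref() consists of a 4-byte little-endian length
+followed by an 8-byte data pointer: a 1000-byte BLOB is stored as the
+bytes 0xE8 0x03 0x00 0x00 followed by the pointer value, and
+row_mysql_read_blob_ref() reads back *len = 1000 and returns that
+pointer. */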
+
+/*******************************************************************//**
+Converting InnoDB geometry data format to MySQL data format. */
+void
+row_mysql_store_geometry(
+/*=====================*/
+ byte* dest, /*!< in/out: where to store */
+ ulint dest_len, /*!< in: dest buffer size: determines
+ into how many bytes the GEOMETRY length
+ is stored, the space for the length
+ may vary from 1 to 4 bytes */
+ const byte* src, /*!< in: GEOMETRY data; if the value to
+ store is SQL NULL this should be NULL
+ pointer */
+ ulint src_len) /*!< in: GEOMETRY length; if the value
+ to store is SQL NULL this should be 0;
+ remember also to set the NULL bit in
+ the MySQL record header! */
+{
+ /* MySQL might assume the field is set to zero except the length and
+ the pointer fields */
+ MEM_CHECK_DEFINED(src, src_len);
+
+ memset(dest, '\0', dest_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_ad(dest_len - 8 > 1 || src_len < 1<<8);
+ ut_ad(dest_len - 8 > 2 || src_len < 1<<16);
+ ut_ad(dest_len - 8 > 3 || src_len < 1<<24);
+
+ mach_write_to_n_little_endian(dest, dest_len - 8, src_len);
+
+ memcpy(dest + dest_len - 8, &src, sizeof src);
+}
+
+/*******************************************************************//**
+Read geometry data in the MySQL format.
+@return pointer to geometry data */
+static
+const byte*
+row_mysql_read_geometry(
+/*====================*/
+ ulint* len, /*!< out: data length */
+ const byte* ref, /*!< in: geometry data in the
+ MySQL format */
+ ulint col_len) /*!< in: MySQL format length */
+{
+ byte* data;
+ ut_ad(col_len > 8);
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
+
+/**************************************************************//**
+Pad a column with spaces. */
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len) /*!< in: number of bytes to pad */
+{
+ const byte* pad_end;
+
+ switch (UNIV_EXPECT(mbminlen, 1)) {
+ default:
+ ut_error;
+ case 1:
+ /* space=0x20 */
+ memset(pad, 0x20, len);
+ break;
+ case 2:
+ /* space=0x0020 */
+ pad_end = pad + len;
+ ut_a(!(len % 2));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ };
+ break;
+ case 4:
+ /* space=0x00000020 */
+ pad_end = pad + len;
+ ut_a(!(len % 4));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ }
+ break;
+ }
+}
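+
+/* For example, row_mysql_pad_col(2, pad, 6) fills the buffer with the
+UCS-2 space 0x00 0x20 repeated three times, and row_mysql_pad_col(4, pad, 8)
+fills it with the UTF-32 space 0x00 0x00 0x00 0x20 repeated twice. */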
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp) /*!< in: nonzero=compact format */
+{
+ const byte* ptr = mysql_data;
+ const dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated if the data is a signed integer. In MySQL,
+ integers are stored in a little-endian format. */
+
+ byte* p = buf + col_len;
+
+ for (;;) {
+ p--;
+ *p = *mysql_data;
+ if (p == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *buf ^= 128;
+ }
+
+ ptr = buf;
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored to 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle Unicode strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ switch (mbminlen) {
+ default:
+ ut_error;
+ case 4:
+ /* space=0x00000020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~3U;
+
+ while (col_len >= 4
+ && ptr[col_len - 4] == 0x00
+ && ptr[col_len - 3] == 0x00
+ && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 4;
+ }
+ break;
+ case 2:
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1U;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ break;
+ case 1:
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from UTF-8 and other
+ multibyte charsets, from FIXED-length CHAR columns, to save
+ space. UTF-8 would otherwise normally use 3 * the string length
+ bytes to store an ASCII string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string
+ ".a " where "." denotes a 3-byte character represented
+ by the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string
+ ".abc " will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.cc, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (!row_format_col) {
+		/* If the MySQL data is from a MySQL key value,
+		the length is always stored in 2 bytes, so we
+		need to do nothing here. */
+ } else if (type == DATA_BLOB) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ } else if (DATA_GEOMETRY_MTYPE(type)) {
+ ptr = row_mysql_read_geometry(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
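+
+/* For example, for a signed 4-byte DATA_INT column the value 5, stored by
+MySQL as the little-endian bytes 05 00 00 00, is converted above into the
+big-endian bytes 80 00 00 05 (byte order reversed, sign bit flipped), and
+-1 (FF FF FF FF) becomes 7F FF FF FF, so that an unsigned byte-wise
+comparison of the InnoDB format preserves the numeric order. */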
+
+/**************************************************************//**
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /*!< in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ const byte* mysql_rec, /*!< in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+ mem_heap_t** blob_heap) /*!< in: FIX_ME, remove this after
+ server fixes its issue */
+{
+ const mysql_row_templ_t*templ;
+ dfield_t* dfield;
+ ulint i;
+ ulint n_col = 0;
+ ulint n_v_col = 0;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ if (templ->is_virtual) {
+ ut_ad(n_v_col < dtuple_get_n_v_fields(row));
+ dfield = dtuple_get_nth_v_field(row, n_v_col);
+ n_v_col++;
+ } else {
+ dfield = dtuple_get_nth_field(row, n_col);
+ n_col++;
+ }
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset]
+ & (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_null(dfield);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ prebuilt->ins_upd_rec_buff + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ dict_table_is_comp(prebuilt->table));
+
+		/* The server has an issue with handling BLOB virtual fields,
+		so we need to duplicate the data in our own memory here */
+ if (templ->is_virtual
+ && DATA_LARGE_MTYPE(dfield_get_type(dfield)->mtype)) {
+ if (*blob_heap == NULL) {
+ *blob_heap = mem_heap_create(dfield->len);
+ }
+ dfield_dup(dfield, *blob_heap);
+ }
+next_column:
+ ;
+ }
+
+	/* If there is an FTS doc id column and it is not user supplied
+	(i.e. generated by the server), then assign it a new doc id. */
+ if (!prebuilt->table->fts) {
+ return;
+ }
+
+ ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED);
+
+ doc_id_t doc_id;
+
+ if (!DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ if (prebuilt->table->fts->cache->first_doc_id
+ == FTS_NULL_DOC_ID) {
+ fts_get_next_doc_id(prebuilt->table, &doc_id);
+ }
+ return;
+ }
+
+ dfield_t* fts_doc_id = dtuple_get_nth_field(
+ row, prebuilt->table->fts->doc_col);
+
+ if (fts_get_next_doc_id(prebuilt->table, &doc_id) == DB_SUCCESS) {
+ ut_a(doc_id != FTS_NULL_DOC_ID);
+ ut_ad(sizeof(doc_id) == fts_doc_id->type.len);
+ dfield_set_data(fts_doc_id, prebuilt->ins_upd_rec_buff
+ + prebuilt->mysql_row_len, 8);
+ fts_write_doc_id(fts_doc_id->data, doc_id);
+ } else {
+ dfield_set_null(fts_doc_id);
+ }
+}
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread; in that case, the thr is ALREADY in the running state. */
+bool
+row_mysql_handle_errors(
+/*====================*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+{
+ dberr_t err;
+
+ DBUG_ENTER("row_mysql_handle_errors");
+ DEBUG_SYNC_C("row_mysql_handle_errors");
+
+ err = trx->error_state;
+
+handle_new_error:
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ DBUG_LOG("trx", "handle error: " << err
+ << ";id=" << ib::hex(trx->id) << ", " << trx);
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ extern my_bool innobase_rollback_on_timeout;
+ if (innobase_rollback_on_timeout) {
+ goto rollback;
+ }
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_UNDO_RECORD_TOO_BIG:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ case DB_READ_ONLY:
+ case DB_FTS_INVALID_DOCID:
+ case DB_INTERRUPTED:
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ case DB_TABLE_NOT_FOUND:
+ case DB_DECRYPTION_FAILED:
+ case DB_COMPUTE_VALUE_FAILED:
+ rollback_to_savept:
+ DBUG_EXECUTE_IF("row_mysql_crash_if_error", {
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(); });
+ if (savept) {
+ /* Roll back the latest, possibly incomplete insertion
+ or update */
+
+ trx->rollback(savept);
+ }
+ if (!trx->bulk_insert) {
+ /* MariaDB will roll back the latest SQL statement */
+ break;
+ }
+ /* MariaDB will roll back the entire transaction. */
+ trx->bulk_insert = false;
+ trx->last_sql_stat_start.least_undo_no = 0;
+ trx->savepoints_discard();
+ break;
+ case DB_LOCK_WAIT:
+ err = lock_wait(thr);
+ if (err != DB_SUCCESS) {
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ DBUG_RETURN(true);
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ rollback:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx->rollback();
+ break;
+
+ case DB_IO_ERROR:
+ case DB_TABLE_CORRUPT:
+ case DB_CORRUPTION:
+ case DB_PAGE_CORRUPTED:
+ ib::error() << "We detected index corruption in an InnoDB type"
+ " table. You have to dump + drop + reimport the"
+ " table or, in a case of widespread corruption,"
+ " dump all InnoDB tables and recreate the whole"
+ " tablespace. If the mariadbd server crashes after"
+ " the startup or when you dump the tables. "
+ << FORCE_RECOVERY_MSG;
+ goto rollback_to_savept;
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ib::error() << "Cannot delete/update rows with cascading"
+ " foreign key constraints that exceed max depth of "
+ << FK_MAX_CASCADE_DEL << ". Please drop excessive"
+ " foreign constraints and try again";
+ goto rollback_to_savept;
+ case DB_UNSUPPORTED:
+ ib::error() << "Cannot delete/update rows with cascading"
+ " foreign key constraints in timestamp-based temporal"
+ " table. Please drop excessive"
+ " foreign constraints and try again";
+ goto rollback_to_savept;
+ default:
+ ib::fatal() << "Unknown error " << err;
+ }
+
+ if (dberr_t n_err = trx->error_state) {
+ trx->error_state = DB_SUCCESS;
+ *new_err = n_err;
+ } else {
+ *new_err = err;
+ }
+
+ DBUG_RETURN(false);
+}
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len) /*!< in: length in bytes of a row in
+ the MySQL format */
+{
+ DBUG_ENTER("row_create_prebuilt");
+
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dict_index_t* temp_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ uint srch_key_len = 0;
+ ulint search_tuple_n_fields;
+
+ search_tuple_n_fields = 2 * (dict_table_get_n_cols(table)
+ + dict_table_get_n_v_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * unsigned(table->n_cols) >= unsigned(clust_index->n_fields)
+ - clust_index->table->n_dropped());
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+
+ /* Maximum size of the buffer needed for conversion of INTs from
+ little endian format to big endian format in an index. An index
+	can have at most 16 columns (MAX_REF_PARTS) in it. Therefore the
+	maximum size for a PK is 16 * 8 bytes (BIGINT's size) = 128 bytes,
+	and for a secondary index 16 * 8 bytes + PK = 256 bytes. */
+#define MAX_SRCH_KEY_VAL_BUFFER 2* (8 * MAX_REF_PARTS)
+
+#define PREBUILT_HEAP_INITIAL_SIZE \
+ ( \
+ sizeof(*prebuilt) \
+ /* allocd in this function */ \
+ + DTUPLE_EST_ALLOC(search_tuple_n_fields) \
+ + DTUPLE_EST_ALLOC(ref_len) \
+ /* allocd in row_prebuild_sel_graph() */ \
+ + sizeof(sel_node_t) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_update_vector() */ \
+ + sizeof(upd_node_t) \
+ + sizeof(upd_t) \
+ + sizeof(upd_field_t) \
+ * dict_table_get_n_cols(table) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_insert_row() */ \
+ + sizeof(ins_node_t) \
+ /* mysql_row_len could be huge and we are not \
+ sure if this prebuilt instance is going to be \
+ used in inserts */ \
+ + (mysql_row_len < 256 ? mysql_row_len : 0) \
+ + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table) \
+ + dict_table_get_n_v_cols(table)) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ + sizeof(*prebuilt->pcur) \
+ + sizeof(*prebuilt->clust_pcur) \
+ )
+
+ /* Calculate size of key buffer used to store search key in
+ InnoDB format. MySQL stores INTs in little endian format and
+ InnoDB stores INTs in big endian format with the sign bit
+ flipped. All other field types are stored/compared the same
+ in MySQL and InnoDB, so we must create a buffer containing
+	the INT key parts in InnoDB format. We need two such buffers
+ since both start and end keys are used in records_in_range(). */
+
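+	/* Walk all indexes and, for each, add up the fixed lengths of the
+	INT columns among its unique fields; the search key buffer must be
+	able to hold the largest such total. */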
+ for (temp_index = dict_table_get_first_index(table); temp_index;
+ temp_index = dict_table_get_next_index(temp_index)) {
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(temp_index->n_user_defined_cols
+ == MAX_REF_PARTS););
+ if (temp_index->is_corrupted()) {
+ continue;
+ }
+
+ uint temp_len = 0;
+ for (uint i = 0; i < temp_index->n_uniq; i++) {
+ ulint type = temp_index->fields[i].col->mtype;
+ if (type == DATA_INT) {
+ temp_len +=
+ temp_index->fields[i].fixed_len;
+ }
+ }
+		srch_key_len = std::max(srch_key_len, temp_len);
+ }
+
+ ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER);
+
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER););
+
+ /* We allocate enough space for the objects that are likely to
+ be created later in order to minimize the number of malloc()
+ calls */
+ heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len);
+
+ prebuilt = static_cast<row_prebuilt_t*>(
+ mem_heap_zalloc(heap, sizeof(*prebuilt)));
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->heap = heap;
+
+ prebuilt->srch_key_val_len = srch_key_len;
+ if (prebuilt->srch_key_val_len) {
+ prebuilt->srch_key_val1 = static_cast<byte*>(
+ mem_heap_alloc(prebuilt->heap,
+ 2 * prebuilt->srch_key_val_len));
+ prebuilt->srch_key_val2 = prebuilt->srch_key_val1 +
+ prebuilt->srch_key_val_len;
+ } else {
+ prebuilt->srch_key_val1 = NULL;
+ prebuilt->srch_key_val2 = NULL;
+ }
+
+ prebuilt->pcur = static_cast<btr_pcur_t*>(
+ mem_heap_zalloc(prebuilt->heap,
+ sizeof(btr_pcur_t)));
+ prebuilt->clust_pcur = static_cast<btr_pcur_t*>(
+ mem_heap_zalloc(prebuilt->heap,
+ sizeof(btr_pcur_t)));
+ btr_pcur_reset(prebuilt->pcur);
+ btr_pcur_reset(prebuilt->clust_pcur);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE_UNSET;
+
+ prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ prebuilt->autoinc_error = DB_SUCCESS;
+ prebuilt->autoinc_offset = 0;
+
+ /* Default to 1, we will set the actual value later in
+ ha_innobase::get_auto_increment(). */
+ prebuilt->autoinc_increment = 1;
+
+ prebuilt->autoinc_last_value = 0;
+
+ /* During UPDATE and DELETE we need the doc id. */
+ prebuilt->fts_doc_id = 0;
+
+ prebuilt->mysql_row_len = mysql_row_len;
+
+ prebuilt->fts_doc_id_in_read_set = 0;
+ prebuilt->blob_heap = NULL;
+
+ DBUG_RETURN(prebuilt);
+}
+
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt)
+{
+ DBUG_ENTER("row_prebuilt_free");
+
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_reset(prebuilt->pcur);
+ btr_pcur_reset(prebuilt->clust_pcur);
+
+ ut_free(prebuilt->mysql_template);
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
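+	/* The row fetch cache is a single contiguous allocation: each of
+	the MYSQL_FETCH_CACHE_SIZE row buffers is preceded and followed by
+	a 4-byte ROW_PREBUILT_FETCH_MAGIC_N marker, so fetch_cache[0] - 4
+	points to the start of the block. Verify the markers before
+	freeing it. */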
+ if (prebuilt->fetch_cache[0] != NULL) {
+ byte* base = prebuilt->fetch_cache[0] - 4;
+ byte* ptr = base;
+
+ for (ulint i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ ulint magic1 = mach_read_from_4(ptr);
+ ut_a(magic1 == ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+
+ byte* row = ptr;
+ ut_a(row == prebuilt->fetch_cache[i]);
+ ptr += prebuilt->mysql_row_len;
+
+ ulint magic2 = mach_read_from_4(ptr);
+ ut_a(magic2 == ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+ }
+
+ ut_free(base);
+ }
+
+ if (prebuilt->rtr_info) {
+ rtr_clean_rtr_info(prebuilt->rtr_info, true);
+ }
+ if (prebuilt->table) {
+ dict_table_close(prebuilt->table);
+ }
+
+ mem_heap_free(prebuilt->heap);
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it.
+@return prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node != 0) {
+
+ /* Check if indexes have been dropped or added and we
+ may need to rebuild the row insert template. */
+
+ if (prebuilt->trx_id == table->def_trx_id
+ && prebuilt->ins_node->entry_list.size()
+ == UT_LIST_GET_LEN(table->indexes)) {
+ return(prebuilt->ins_node->row);
+ }
+
+ ut_ad(prebuilt->trx_id < table->def_trx_id);
+
+ que_graph_free_recursive(prebuilt->ins_graph);
+
+ prebuilt->ins_graph = 0;
+ }
+
+	/* Create an insert node and query graph for the prebuilt struct */
+
+ ins_node_t* node;
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == 0) {
+ prebuilt->ins_upd_rec_buff = static_cast<byte*>(
+ mem_heap_alloc(
+ prebuilt->heap,
+ DICT_TF2_FLAG_IS_SET(prebuilt->table,
+ DICT_TF2_FTS_HAS_DOC_ID)
+ ? prebuilt->mysql_row_len + 8/* FTS_DOC_ID */
+ : prebuilt->mysql_row_len));
+ }
+
+ dtuple_t* row;
+
+ row = dtuple_create_with_vcol(
+ prebuilt->heap, dict_table_get_n_cols(table),
+ dict_table_get_n_v_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+ que_thr_t* fork = pars_complete_graph_for_exec(
+ node, prebuilt->trx, prebuilt->heap, prebuilt);
+ fork->state = QUE_THR_RUNNING;
+
+ prebuilt->ins_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(fork));
+
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+ prebuilt->trx_id = table->def_trx_id;
+
+ return(prebuilt->ins_node->row);
+}
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ const dict_table_t* table = prebuilt->table;
+ que_thr_t* thr;
+ dberr_t err;
+
+ /* If we already hold an AUTOINC lock on the table then do nothing.
+ Note: We peek at the value of the current owner without acquiring
+ lock_sys.latch. */
+ if (trx == table->autoinc_trx) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
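+	/* Keep retrying the AUTO_INC lock request as long as
+	row_mysql_handle_errors() reports that the error (for example a
+	lock wait) was resolved. */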
+ do {
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx, true);
+
+ err = lock_table(prebuilt->table, NULL, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+ } while (err != DB_SUCCESS
+ && row_mysql_handle_errors(&err, trx, thr, NULL));
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Lock a table.
+@param[in,out] prebuilt table handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_lock_table(row_prebuilt_t* prebuilt)
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ dberr_t err;
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ do {
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx, false);
+
+ err = lock_table(prebuilt->table, NULL, static_cast<lock_mode>(
+ prebuilt->select_lock_type), thr);
+ trx->error_state = err;
+ } while (err != DB_SUCCESS
+ && row_mysql_handle_errors(&err, trx, thr, NULL));
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Determine whether the tablespace is encrypted but cannot be decrypted,
+whether the table is corrupted, or whether the tablespace .ibd file is missing.
+@param[in] table Table
+@param[in] trx Transaction
+@param[in] push_warning true if we should push warning to user
+@retval DB_DECRYPTION_FAILED table is encrypted but decryption failed
+@retval DB_CORRUPTION table is corrupted
+@retval DB_TABLESPACE_NOT_FOUND tablespace .ibd file not found */
+static
+dberr_t
+row_mysql_get_table_status(
+ const dict_table_t* table,
+ trx_t* trx,
+ bool push_warning = true)
+{
+ dberr_t err;
+ if (const fil_space_t* space = table->space) {
+ if (space->crypt_data && space->crypt_data->is_encrypted()) {
+ // maybe we cannot access the table due to failing
+ // to decrypt
+ if (push_warning) {
+ ib_push_warning(trx, DB_DECRYPTION_FAILED,
+					"Table %s is encrypted."
+					" However, the key management plugin or used key_id was not found,"
+					" or the used encryption algorithm or method does not match.",
+ table->name.m_name);
+ }
+
+ err = DB_DECRYPTION_FAILED;
+ } else {
+ if (push_warning) {
+ ib_push_warning(trx, DB_CORRUPTION,
+ "Table %s in tablespace %lu corrupted.",
+ table->name.m_name, table->space);
+ }
+
+ err = DB_CORRUPTION;
+ }
+ } else {
+ ib::error() << ".ibd file is missing for table "
+ << table->name;
+ err = DB_TABLESPACE_NOT_FOUND;
+ }
+
+ return(err);
+}
+
+/** Does an insert for MySQL.
+@param[in] mysql_rec row in the MySQL format
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_insert_for_mysql(
+ const byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ ins_mode_t ins_mode)
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ dict_table_t* table = prebuilt->table;
+
+	/* FIX_ME: This blob heap is used to compensate for an issue in the
+	server's handling of virtual column blobs */
+ mem_heap_t* blob_heap = NULL;
+
+ ut_ad(trx);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ if (!table->space) {
+ ib::error() << "The table " << table->name
+ << " doesn't have a corresponding tablespace, it was"
+ " discarded.";
+
+ return(DB_TABLESPACE_DELETED);
+ } else if (!table->is_readable()) {
+ return row_mysql_get_table_status(table, trx, true);
+ } else if (high_level_read_only) {
+ return(DB_READ_ONLY);
+ } else if (UNIV_UNLIKELY(table->corrupted)
+ || dict_table_get_first_index(table)->is_corrupted()) {
+ return DB_TABLE_CORRUPT;
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ if (!table->no_rollback()) {
+ trx_start_if_not_started_xa(trx, true);
+ }
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec,
+ &blob_heap);
+
+ if (ins_mode != ROW_INS_NORMAL) {
+ node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL);
+ }
+
+ /* Because we now allow multiple INSERT into the same
+ initially empty table in bulk insert mode, on error we must
+ roll back to the start of the transaction. For correctness, it
+ would suffice to roll back to the start of the first insert
+ into this empty table, but we will keep it simple and efficient. */
+ savept.least_undo_no = trx->bulk_insert ? 0 : trx->undo_no;
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
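+	/* At the start of an SQL statement the insert node must first set
+	an IX lock on the table; for subsequent rows of the same statement
+	it can proceed directly to row id allocation. */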
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ node->trx_id = trx->id;
+ }
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ DEBUG_SYNC_C("ib_after_row_insert_step");
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+error_exit:
+ /* FIXME: What's this ? */
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, &savept);
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES
+ || node->state == INS_NODE_ALLOC_ROW_ID
+ || node->state == INS_NODE_SET_IX_LOCK);
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ if (blob_heap != NULL) {
+ mem_heap_free(blob_heap);
+ }
+
+ return(err);
+ }
+
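+	/* Register the inserted document with the FTS subsystem. History
+	rows of a system-versioned table are not full-text indexed and are
+	skipped here. */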
+ if (dict_table_has_fts_index(table)
+ && (!table->versioned()
+ || !node->row->fields[table->vers_end].vers_history_row())) {
+
+ doc_id_t doc_id;
+
+ /* Extract the doc id from the hidden FTS column */
+ doc_id = fts_get_doc_id_from_row(table, node->row);
+
+ if (doc_id <= 0) {
+ ib::error() << "FTS_DOC_ID must be larger than 0 for table "
+ << table->name;
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+
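+		/* When FTS_DOC_ID is a user-supplied column rather than a
+		hidden one, its value must not be smaller than the next
+		expected Doc ID. */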
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t next_doc_id
+ = table->fts->cache->next_doc_id;
+
+ if (doc_id < next_doc_id) {
+ ib::error() << "FTS_DOC_ID must be larger than "
+ << next_doc_id - 1 << " for table "
+ << table->name;
+
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+ }
+
+ if (table->skip_alter_undo) {
+ if (trx->fts_trx == NULL) {
+ trx->fts_trx = fts_trx_create(trx);
+ }
+
+ fts_trx_table_t ftt;
+ ftt.table = table;
+ ftt.fts_trx = trx->fts_trx;
+
+ fts_add_doc_from_tuple(&ftt, doc_id, node->row);
+ } else {
+ /* Pass NULL for the columns affected, since an INSERT affects
+ all FTS indexes. */
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ }
+ }
+
+ /* Not protected by dict_sys.latch or table->stats_mutex_lock()
+ for performance
+ reasons, we would rather get garbage in stat_n_rows (which is
+ just an estimate anyway) than protecting the following code
+ with a latch. */
+ dict_table_n_rows_inc(table);
+
+ if (prebuilt->clust_index_was_generated) {
+ /* set row id to prebuilt */
+ memcpy(prebuilt->row_id, node->sys_buf, DATA_ROW_ID_LEN);
+ }
+
+ dict_stats_update_if_needed(table, *trx);
+ trx->op_info = "";
+
+ if (blob_heap != NULL) {
+ mem_heap_free(blob_heap);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ que_thr_t* fork = pars_complete_graph_for_exec(
+ node, prebuilt->trx, prebuilt->heap, prebuilt);
+ fork->state = QUE_THR_RUNNING;
+
+ prebuilt->sel_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(fork));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap) /*!< in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ DBUG_ENTER("row_create_update_node_for_mysql");
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = true;
+ node->is_delete = NO_DELETE;
+ node->pcur = new (mem_heap_alloc(heap, sizeof(btr_pcur_t)))
+ btr_pcur_t();
+
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table)
+ + dict_table_get_n_v_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns, &sym_node_t::col_var_list);
+
+ node->has_clust_rec_x_lock = TRUE;
+
+ DBUG_RETURN(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ prebuilt->upd_node = row_create_update_node_for_mysql(
+ prebuilt->table, prebuilt->heap);
+
+ prebuilt->upd_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ prebuilt->upd_node,
+ prebuilt->trx, prebuilt->heap,
+ prebuilt)));
+
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/********************************************************************
+Handle an update of a column that has an FTS index. */
+static
+void
+row_fts_do_update(
+/*==============*/
+ trx_t* trx, /* in: transaction */
+ dict_table_t* table, /* in: Table with FTS index */
+ doc_id_t old_doc_id, /* in: old document id */
+ doc_id_t new_doc_id) /* in: new document id */
+{
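+	/* Register the FTS changes with the transaction: the old document
+	is deleted and, unless the new Doc ID is NULL, the updated document
+	is added. */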
+	if (trx->fts_next_doc_id) {
+		fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+		if (new_doc_id != FTS_NULL_DOC_ID)
+			fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL);
+	}
+}
+
+/************************************************************************
+Handles FTS matters for an update or a delete.
+NOTE: should not be called if the table does not have an FTS index. */
+static
+dberr_t
+row_fts_update_or_delete(
+/*=====================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_t* trx = prebuilt->trx;
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node = prebuilt->upd_node;
+ doc_id_t old_doc_id = prebuilt->fts_doc_id;
+
+ DBUG_ENTER("row_fts_update_or_delete");
+
+ ut_a(dict_table_has_fts_index(prebuilt->table));
+
+ /* Deletes are simple; get them out of the way first. */
+ if (node->is_delete) {
+ /* A delete affects all FTS indexes, so we pass NULL */
+ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+ } else {
+ doc_id_t new_doc_id;
+ new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id);
+
+ if (new_doc_id == 0) {
+ ib::error() << "InnoDB FTS: Doc ID cannot be 0";
+ DBUG_RETURN(DB_FTS_INVALID_DOCID);
+ }
+ row_fts_do_update(trx, table, old_doc_id, new_doc_id);
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Initialize the Doc ID system for FK table with FTS index */
+static
+void
+init_fts_doc_id_for_ref(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+	ulint*		depth)	/*!< in: recursive call depth */
+{
+ table->fk_max_recusive_level = 0;
+
+ /* Limit on tables involved in cascading delete/update */
+ if (++*depth > FK_MAX_CASCADE_DEL) {
+ return;
+ }
+
+ /* Loop through this table's referenced list and also
+ recursively traverse each table's foreign table list */
+ for (dict_foreign_t* foreign : table->referenced_set) {
+ ut_ad(foreign->foreign_table);
+
+ if (foreign->foreign_table->fts) {
+ fts_init_doc_id(foreign->foreign_table);
+ }
+
+ if (foreign->foreign_table != table
+ && !foreign->foreign_table->referenced_set.empty()) {
+ init_fts_doc_id_for_ref(
+ foreign->foreign_table, depth);
+ }
+ }
+}
+
+/** Does an update or delete of a row for MySQL.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_for_mysql(row_prebuilt_t* prebuilt)
+{
+ trx_savept_t savept;
+ dberr_t err;
+ que_thr_t* thr;
+ dict_index_t* clust_index;
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+ ulint fk_depth = 0;
+
+ DBUG_ENTER("row_update_for_mysql");
+
+ ut_ad(trx);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(table->stat_initialized);
+
+ if (!table->is_readable()) {
+ return(row_mysql_get_table_status(table, trx, true));
+ }
+
+ if (high_level_read_only) {
+ return(DB_READ_ONLY);
+ }
+
+ DEBUG_SYNC_C("innodb_row_update_for_mysql_begin");
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ init_fts_doc_id_for_ref(table, &fk_depth);
+
+ if (!table->no_rollback()) {
+ trx_start_if_not_started_xa(trx, true);
+ }
+
+ node = prebuilt->upd_node;
+ const bool is_delete = node->is_delete == PLAIN_DELETE;
+ ut_ad(node->table == table);
+
+ clust_index = dict_table_get_first_index(table);
+
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->pcur->index() == clust_index
+ ? prebuilt->pcur
+ : prebuilt->clust_pcur);
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept.least_undo_no = trx->undo_no;
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ ut_ad(!prebuilt->versioned_write || node->table->versioned());
+
+ if (prebuilt->versioned_write && node->is_delete == VERSIONED_DELETE) {
+ node->vers_make_delete(trx);
+ }
+
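+	/* Execute the update. Lock waits are resolved by
+	row_mysql_handle_errors() and the step is retried; any other error
+	ends the operation. */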
+ for (;;) {
+ thr->run_node = node;
+ thr->prev_node = node;
+ thr->fk_cascade_depth = 0;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err == DB_SUCCESS) {
+ break;
+ }
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ goto error;
+ }
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+
+ DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error");
+
+ bool was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (!was_lock_wait) {
+ goto error;
+ }
+ }
+
+ if (dict_table_has_fts_index(table)
+ && trx->fts_next_doc_id != UINT64_UNDEFINED) {
+ err = row_fts_update_or_delete(prebuilt);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ut_ad("unexpected error" == 0);
+ goto error;
+ }
+ }
+
+ /* Completed cascading operations (if any) */
+ bool update_statistics;
+ ut_ad(is_delete == (node->is_delete == PLAIN_DELETE));
+
+ if (is_delete) {
+ /* Not protected by dict_sys.latch
+ or prebuilt->table->stats_mutex_lock() for performance
+ reasons, we would rather get garbage in stat_n_rows (which is
+ just an estimate anyway) than protecting the following code
+ with a latch. */
+ dict_table_n_rows_dec(prebuilt->table);
+
+ update_statistics = !srv_stats_include_delete_marked;
+ } else {
+ update_statistics
+ = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+ }
+
+ if (update_statistics) {
+ dict_stats_update_if_needed(prebuilt->table, *trx);
+ } else {
+ /* Always update the table modification counter. */
+ prebuilt->table->stat_modified_counter++;
+ }
+
+error:
+ trx->op_info = "";
+ DBUG_RETURN(err);
+}
+
+/** This can only be used when the current transaction is at
+READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_mvcc() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@param[in,out] prebuilt prebuilt struct in MySQL handle
+@param[in] has_latches_on_recs TRUE if called so that we have the
+ latches on the records under pcur
+ and clust_pcur, and we do not need
+ to reposition the cursors. */
+void
+row_unlock_for_mysql(
+ row_prebuilt_t* prebuilt,
+ ibool has_latches_on_recs)
+{
+ if (prebuilt->new_rec_locks == 1 && prebuilt->index->is_clust()) {
+ trx_t* trx = prebuilt->trx;
+ ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED);
+ trx->op_info = "unlock_row";
+
+ const rec_t* rec;
+ dict_index_t* index;
+ trx_id_t rec_trx_id;
+ mtr_t mtr;
+ btr_pcur_t* pcur = prebuilt->pcur;
+
+ mtr_start(&mtr);
+
+ /* Restore the cursor position and find the record */
+
+ if (!has_latches_on_recs
+ && pcur->restore_position(BTR_SEARCH_LEAF, &mtr)
+ != btr_pcur_t::SAME_ALL) {
+ goto no_unlock;
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ index = pcur->index();
+
+ /* If the record has been modified by this
+ transaction, do not unlock it. */
+
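+		/* If DB_TRX_ID is at a fixed byte offset in this clustered
+		index record, read it directly; otherwise compute the record
+		offsets first. */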
+ if (index->trx_id_offset) {
+ rec_trx_id = trx_read_trx_id(rec
+ + index->trx_id_offset);
+ } else {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ if (rec_trx_id != trx->id) {
+ /* We did not update the record: unlock it */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ lock_rec_unlock(
+ trx,
+ btr_pcur_get_block(pcur)->page.id(),
+ rec,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type));
+ }
+no_unlock:
+ mtr_commit(&mtr);
+ trx->op_info = "";
+ }
+}
+
+/** Write query start time as SQL field data to a buffer. Needed by InnoDB.
+@param thd Thread object
+@param buf Buffer to hold start time data */
+void thd_get_query_start_data(THD *thd, char *buf);
+
+/** Insert history row when evaluating foreign key referential action.
+
+1. Create new dtuple_t 'row' from node->historical_row;
+2. Update its row_end to current timestamp;
+3. Insert it into the table;
+4. Update table statistics.
+
+This is used in UPDATE CASCADE/SET NULL of a system versioned referenced table.
+
+node->historical_row: dtuple_t containing pointers to the row changed by the
+referential action.
+
+@param[in] thr current query thread
+@param[in] node a node which just updated a row in a foreign table
+@return DB_SUCCESS or some error */
+static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node)
+{
+ trx_t* trx = thr_get_trx(thr);
+ dfield_t* row_end;
+ char row_end_data[8];
+ dict_table_t* table = node->table;
+ const unsigned zip_size = table->space->zip_size();
+ ut_ad(table->versioned());
+
+ dtuple_t* row;
+ const ulint n_cols = dict_table_get_n_cols(table);
+ const ulint n_v_cols = dict_table_get_n_v_cols(table);
+
+ ut_ad(n_cols == dtuple_get_n_fields(node->historical_row));
+ ut_ad(n_v_cols == dtuple_get_n_v_fields(node->historical_row));
+
+ row = dtuple_create_with_vcol(node->historical_heap, n_cols, n_v_cols);
+
+ dict_table_copy_types(row, table);
+
+ ins_node_t* insert_node =
+ ins_node_create(INS_DIRECT, table, node->historical_heap);
+
+ if (!insert_node) {
+ trx->error_state = DB_OUT_OF_MEMORY;
+ goto exit;
+ }
+
+ insert_node->common.parent = thr;
+ ins_node_set_new_row(insert_node, row);
+
+ ut_ad(n_cols > DATA_N_SYS_COLS);
+ // Exclude DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR
+ for (ulint i = 0; i < n_cols - DATA_N_SYS_COLS; i++) {
+ dfield_t *src= dtuple_get_nth_field(node->historical_row, i);
+ dfield_t *dst= dtuple_get_nth_field(row, i);
+ dfield_copy(dst, src);
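+		/* For externally stored columns, replace the BLOB pointer
+		with the actual column value, copied into the history row's
+		heap. */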
+ if (dfield_is_ext(src)) {
+ byte *field_data
+ = static_cast<byte*>(dfield_get_data(src));
+ ulint ext_len;
+ ulint field_len = dfield_get_len(src);
+
+ ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ ut_a(memcmp(field_data + field_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+
+ byte *data = btr_copy_externally_stored_field(
+ &ext_len, field_data, zip_size, field_len,
+ node->historical_heap);
+ dfield_set_data(dst, data, ext_len);
+ }
+ }
+
+ for (ulint i = 0; i < n_v_cols; i++) {
+ dfield_t *dst= dtuple_get_nth_v_field(row, i);
+ dfield_t *src= dtuple_get_nth_v_field(node->historical_row, i);
+ dfield_copy(dst, src);
+ }
+
+ node->historical_row = NULL;
+
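+	/* Set row_end of the history row: a native (transaction-id based)
+	versioning column stores the current transaction id, otherwise the
+	query start timestamp is written. */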
+ row_end = dtuple_get_nth_field(row, table->vers_end);
+ if (dict_table_get_nth_col(table, table->vers_end)->vers_native()) {
+ mach_write_to_8(row_end_data, trx->id);
+ dfield_set_data(row_end, row_end_data, 8);
+ } else {
+ thd_get_query_start_data(trx->mysql_thd, row_end_data);
+ dfield_set_data(row_end, row_end_data, 7);
+ }
+
+ for (;;) {
+ thr->run_node = insert_node;
+ thr->prev_node = insert_node;
+
+ row_ins_step(thr);
+
+ switch (trx->error_state) {
+ case DB_LOCK_WAIT:
+ if (lock_wait(thr) == DB_SUCCESS) {
+ continue;
+ }
+
+ /* fall through */
+ default:
+ /* Other errors are handled for the parent node. */
+ thr->fk_cascade_depth = 0;
+ goto exit;
+
+ case DB_SUCCESS:
+ dict_stats_update_if_needed(table, *trx);
+ goto exit;
+ }
+ }
+exit:
+ que_graph_free_recursive(insert_node);
+ mem_heap_free(node->historical_heap);
+ node->historical_heap = NULL;
+ return trx->error_state;
+}
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+{
+ /* Increment fk_cascade_depth to record the recursive call depth on
+ a single update/delete that affects multiple tables chained
+ together with foreign key relations. */
+
+ if (++thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
+ return(DB_FOREIGN_EXCEED_MAX_CASCADE);
+ }
+
+ trx_t* trx = thr_get_trx(thr);
+
+ if (table->versioned()) {
+ if (node->is_delete == PLAIN_DELETE) {
+ node->vers_make_delete(trx);
+ } else if (node->update->affects_versioned()) {
+ dberr_t err = row_update_vers_insert(thr, node);
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ node->vers_make_update(trx);
+ }
+ }
+
+ for (;;) {
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ DEBUG_SYNC_C("foreign_constraint_update_cascade");
+ {
+ TABLE *mysql_table = thr->prebuilt->m_mysql_table;
+ thr->prebuilt->m_mysql_table = NULL;
+ row_upd_step(thr);
+ thr->prebuilt->m_mysql_table = mysql_table;
+ }
+
+ switch (trx->error_state) {
+ case DB_LOCK_WAIT:
+ if (lock_wait(thr) == DB_SUCCESS) {
+ continue;
+ }
+
+ /* fall through */
+ default:
+ /* Other errors are handled for the parent node. */
+ thr->fk_cascade_depth = 0;
+ return trx->error_state;
+
+ case DB_SUCCESS:
+ thr->fk_cascade_depth = 0;
+ bool stats;
+
+ if (node->is_delete == PLAIN_DELETE) {
+ /* Not protected by dict_sys.latch
+ or node->table->stats_mutex_lock() for
+ performance reasons, we would rather
+ get garbage in stat_n_rows (which is
+ just an estimate anyway) than
+ protecting the following code with a
+ latch. */
+ dict_table_n_rows_dec(node->table);
+
+ stats = !srv_stats_include_delete_marked;
+ } else {
+ stats = !(node->cmpl_info
+ & UPD_NODE_NO_ORD_CHANGE);
+ }
+
+ if (stats) {
+ dict_stats_update_if_needed(node->table, *trx);
+ } else {
+ /* Always update the table
+ modification counter. */
+ node->table->stat_modified_counter++;
+ }
+
+ return(DB_SUCCESS);
+ }
+ }
+}
+
+/*********************************************************************//**
+Creates a table for MySQL. On failure the transaction will be rolled back
+and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(dict_sys.sys_tables_exist());
+ ut_ad(dict_sys.locked());
+ ut_ad(trx->dict_operation_lock_mode);
+
+ DEBUG_SYNC_C("create_table");
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_start_of_row_create_table_for_mysql",
+ dict_mem_table_free(table); return DB_ERROR;
+ );
+
+ trx->op_info = "creating table";
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = true;
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ dberr_t err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ dict_mem_table_free(table);
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create an index when creating a table.
+On failure, the caller must drop the table!
+@return error number or DB_SUCCESS */
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths, /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ ulint i;
+ ulint len;
+ dict_table_t* table = index->table;
+
+ ut_ad(dict_sys.locked());
+
+ for (i = 0; i < index->n_def; i++) {
+ /* Check that prefix_len and actual length
+ < DICT_MAX_INDEX_COL_LEN */
+
+ len = dict_index_get_nth_field(index, i)->prefix_len;
+
+ if (field_lengths && field_lengths[i]) {
+ len = ut_max(len, field_lengths[i]);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_create_index",
+ len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1;
+ );
+
+ /* Column or prefix length exceeds maximum column length */
+ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) {
+ dict_mem_index_free(index);
+ return DB_TOO_BIG_INDEX_COL;
+ }
+ }
+
+	/* For temporary tables we avoid insertion into SYSTEM TABLES to
+	maintain performance, and so we have a separate path that directly
+	just updates the dictionary cache. */
+ if (!table->is_temporary()) {
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(trx->dict_operation);
+ trx->op_info = "creating index";
+
+ /* Note that the space id where we store the index is
+ inherited from the table in dict_build_index_def_step()
+ in dict0crea.cc. */
+
+ heap = mem_heap_create(512);
+ node = ind_create_graph_create(index, table->name.m_name,
+ heap, mode, key_id);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(
+ que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ index = node->index;
+
+ ut_ad(!index == (err != DB_SUCCESS));
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ if (index && (index->type & DICT_FTS)) {
+ err = fts_create_index_tables(trx, index, table->id);
+ }
+
+ trx->op_info = "";
+ } else {
+ dict_build_index_def(table, index, trx);
+
+ err = dict_index_add_to_cache(index, FIL_NULL);
+ ut_ad((index == NULL) == (err != DB_SUCCESS));
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ ut_ad(!index->is_instant());
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ err = dict_create_index_tree_in_mem(index, trx);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!index->search_info->ref_count);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (err != DB_SUCCESS) {
+ dict_index_remove_from_cache(table, index);
+ }
+ }
+ }
+
+ return(err);
+}
+
+/** Reassigns the table identifier of a table.
+@param[in,out] table table
+@param[in,out] trx transaction
+@param[out] new_id new table id
+@return error code or DB_SUCCESS */
+static
+dberr_t
+row_mysql_table_id_reassign(
+ dict_table_t* table,
+ trx_t* trx,
+ table_id_t* new_id)
+{
+ if (!dict_sys.sys_tables || dict_sys.sys_tables->corrupted ||
+ !dict_sys.sys_columns || dict_sys.sys_columns->corrupted ||
+ !dict_sys.sys_indexes || dict_sys.sys_indexes->corrupted ||
+ !dict_sys.sys_virtual || dict_sys.sys_virtual->corrupted) {
+ return DB_CORRUPTION;
+ }
+
+ dberr_t err;
+ pars_info_t* info = pars_info_create();
+
+ dict_hdr_get_new_id(new_id, NULL, NULL);
+
+ pars_info_add_ull_literal(info, "old_id", table->id);
+ pars_info_add_ull_literal(info, "new_id", *new_id);
+
+ /* Note: This cannot be rolled back. Rollback would see the
+ UPDATE SYS_INDEXES as two operations: DELETE and INSERT.
+ It would invoke btr_free_if_exists() when rolling back the
+ INSERT, effectively dropping all indexes of the table. */
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENUMBER_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET ID = :new_id\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_VIRTUAL SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "END;\n", trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Do the foreign key constraint checks.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace_foreign_key_checks(
+/*======================================*/
+ const trx_t* trx, /*!< in: transaction handle */
+ const dict_table_t* table) /*!< in: table to be discarded */
+{
+
+ if (srv_read_only_mode || !trx->check_foreigns) {
+ return(DB_SUCCESS);
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+ dict_foreign_set::const_iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_different_tables());
+
+ if (it == table->referenced_set.end()) {
+ return(DB_SUCCESS);
+ }
+
+ const dict_foreign_t* foreign = *it;
+ FILE* ef = dict_foreign_err_file;
+
+ ut_ad(foreign->foreign_table != table);
+ ut_ad(foreign->referenced_table == table);
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mysql_mutex_lock(&dict_foreign_err_mutex);
+
+ rewind(ef);
+
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+ ut_print_name(ef, trx, table->name.m_name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, foreign->foreign_table_name);
+ putc('\n', ef);
+
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*********************************************************************//**
+Do the DISCARD TABLESPACE operation.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction handle */
+ dict_table_t* table) /*!< in/out: table to be discarded */
+{
+ dberr_t err;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages. The SQL layer will block all DML on the table using MDL and a
+ DISCARD will not start unless all existing operations on the
+ table to be discarded are completed.
+
+ 1) Acquire the data dictionary latch in X mode. This will
+ prevent any internal operations that are not covered by
+ MDL or InnoDB table locks.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: we remove all entries for the tablespace in
+ the insert buffer tree. */
+
+ ibuf_delete_for_discarded_space(table->space_id);
+
+ table_id_t new_id;
+
+ /* Set the TABLESPACE DISCARD flag in the table definition
+ on disk. */
+ err = row_import_update_discarded_flag(trx, table->id, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Update the index root pages in the system tables, on disk */
+ err = row_import_update_index_root(trx, table, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Drop all the FTS auxiliary tables. */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ fts_drop_tables(trx, *table);
+ }
+
+ /* Assign a new space ID to the table definition so that purge
+ can ignore the changes. Update the system table on disk. */
+
+ err = row_mysql_table_id_reassign(table, trx, &new_id);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* All persistent operations successful, update the
+ data dictionary memory cache. */
+
+ dict_table_change_id_in_cache(table, new_id);
+
+ dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ if (index) index->clear_instant_alter();
+
+ /* Reset the root page numbers. */
+ for (; index; index = UT_LIST_GET_NEXT(indexes, index)) {
+ index->page = FIL_NULL;
+ }
+
+ /* If the tablespace did not already exist or we couldn't
+ write to it, we treat that as a successful DISCARD. It is
+ unusable anyway. */
+ return DB_SUCCESS;
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the file_unreadable flag is set.
+@return error code or DB_SUCCESS */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
+{
+ ut_ad(!is_system_tablespace(table->space_id));
+ ut_ad(!table->is_temporary());
+
+ const auto fts_exist = table->flags2 &
+ (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS);
+
+ dberr_t err;
+
+ if (fts_exist)
+ {
+ fts_optimize_remove_table(table);
+ purge_sys.stop_FTS(*table);
+ err= fts_lock_tables(trx, *table);
+ if (err != DB_SUCCESS)
+ {
+rollback:
+ if (fts_exist)
+ {
+ purge_sys.resume_FTS();
+ fts_optimize_add_table(table);
+ }
+ trx->rollback();
+ if (trx->dict_operation_lock_mode)
+ row_mysql_unlock_data_dictionary(trx);
+ return err;
+ }
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+ trx->op_info = "discarding tablespace";
+ trx->dict_operation= true;
+
+ /* We serialize data dictionary operations with dict_sys.latch:
+ this is to avoid deadlocks during data dictionary operations */
+
+ err= row_discard_tablespace_foreign_key_checks(trx, table);
+ if (err != DB_SUCCESS)
+ goto rollback;
+
+ /* Note: The following cannot be rolled back. Rollback would see the
+ UPDATE of SYS_INDEXES.TABLE_ID as two operations: DELETE and INSERT.
+ It would invoke btr_free_if_exists() when rolling back the INSERT,
+ effectively dropping all indexes of the table. Furthermore, calls like
+ ibuf_delete_for_discarded_space() are already discarding data
+ before the transaction is committed.
+
+ It would be better to remove the integrity-breaking
+ ALTER TABLE...DISCARD TABLESPACE operation altogether. */
+ table->file_unreadable= true;
+ table->space= nullptr;
+ table->flags2|= DICT_TF2_DISCARDED;
+ err= row_discard_tablespace(trx, table);
+ DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+ log_buffer_flush_to_disk(); DBUG_SUICIDE(););
+ /* FTS_ tables may be deleted */
+ std::vector<pfs_os_file_t> deleted;
+ trx->commit(deleted);
+ const auto space_id= table->space_id;
+ pfs_os_file_t d= fil_delete_tablespace(space_id);
+ DBUG_EXECUTE_IF("ib_discard_after_commit_crash", DBUG_SUICIDE(););
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (d != OS_FILE_CLOSED)
+ os_file_close(d);
+ for (pfs_os_file_t d : deleted)
+ os_file_close(d);
+
+ if (fts_exist)
+ purge_sys.resume_FTS();
+
+ ibuf_delete_for_discarded_space(space_id);
+ buf_flush_remove_pages(space_id);
+ trx->op_info= "";
+ return err;
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint_low(
+/*======================*/
+ const char* id, /*!< in: constraint id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", id);
+
+ return(que_eval_sql(info,
+ "PROCEDURE DELETE_CONSTRAINT () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+ "END;\n", trx));
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint(
+/*==================*/
+ const char* id, /*!< in: constraint id */
+ const char* database_name, /*!< in: database name, with the
+ trailing '/' */
+ mem_heap_t* heap, /*!< in: memory heap */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+
+ /* New format constraints have ids <databasename>/<constraintname>. */
+ err = row_delete_constraint_low(
+ mem_heap_strcat(heap, database_name, id), trx);
+
+ if ((err == DB_SUCCESS) && !strchr(id, '/')) {
+ /* Old format < 4.0.18 constraints have constraint ids
+ NUMBER_NUMBER. We only try deleting them if the
+ constraint name does not contain a '/' character, otherwise
+ deleting a new format constraint named 'foo/bar' from
+ database 'baz' would remove constraint 'bar' from database
+ 'foo', if it existed. */
+
+ err = row_delete_constraint_low(id, trx);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool use_fk) /*!< in: whether to parse and enforce
+ FOREIGN KEY constraints */
+{
+ dict_table_t* table = NULL;
+ dberr_t err = DB_ERROR;
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool old_is_tmp, new_is_tmp;
+ pars_info_t* info = NULL;
+
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(trx->dict_operation_lock_mode);
+
+ if (high_level_read_only) {
+ return(DB_READ_ONLY);
+ }
+
+ trx->op_info = "renaming table";
+
+ old_is_tmp = dict_table_t::is_temporary_name(old_name);
+ new_is_tmp = dict_table_t::is_temporary_name(new_name);
+
+ table = dict_table_open_on_name(old_name, true,
+ DICT_ERR_IGNORE_FK_NOKEY);
+
+ /* MariaDB partition engine hard codes the file name
+ separator as "#P#" and "#SP#". The text case is fixed even if
+ lower_case_table_names is set to 1 or 2. InnoDB always
+ normalises file names to lower case on Windows, this
+ can potentially cause problems when copying/moving
+ tables between platforms.
+
+	1) If we boot against an installation from a Windows
+	platform, then its partition table names could
+	all be in lower case in the system tables. So we
+	will need to check the lower case name when loading the table.
+
+	2) If we boot an installation from another case
+	sensitive platform on Windows, we might need to
+	check the existence of the table name without
+	lowering its case in the system tables. */
+ if (!table && lower_case_table_names == 1
+ && strstr(old_name, table_name_t::part_suffix)) {
+ char par_case_name[MAX_FULL_NAME_LEN + 1];
+#ifndef _WIN32
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ memcpy(par_case_name, old_name,
+ strlen(old_name));
+ par_case_name[strlen(old_name)] = 0;
+ innobase_casedn_str(par_case_name);
+#else
+		/* On the Windows platform, check
+		whether there exists a table name in
+		the system tables whose name has
+		not been normalized to lower case */
+ normalize_table_name_c_low(
+ par_case_name, old_name, FALSE);
+#endif
+ table = dict_table_open_on_name(par_case_name, true,
+ DICT_ERR_IGNORE_FK_NOKEY);
+ }
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ goto funct_exit;
+ }
+
+ ut_ad(!table->is_temporary());
+
+ if (!table->is_readable() && !table->space
+ && !(table->flags2 & DICT_TF2_DISCARDED)) {
+
+ err = DB_TABLE_NOT_FOUND;
+
+ ib::error() << "Table " << old_name << " does not have an .ibd"
+ " file in the database directory. "
+ << TROUBLESHOOTING_MSG;
+
+ goto funct_exit;
+
+ } else if (use_fk && !old_is_tmp && new_is_tmp) {
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(
+ heap, trx, table, &n_constraints_to_drop,
+ &constraints_to_drop);
+
+ if (err != DB_SUCCESS) {
+ goto funct_exit;
+ }
+ }
+
+ err = trx_undo_report_rename(trx, table);
+
+ if (err != DB_SUCCESS) {
+ goto funct_exit;
+ }
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data from system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET NAME = :new_table_name\n"
+ " WHERE NAME = :old_table_name;\n"
+ "END;\n", trx);
+
+ if (err != DB_SUCCESS) {
+ // Assume the caller guarantees destination name doesn't exist.
+ ut_ad(err != DB_DUPLICATE_KEY);
+ goto rollback_and_exit;
+ }
+
+ if (!new_is_tmp) {
+ /* Rename all constraints. */
+ char new_table_name[MAX_TABLE_NAME_LEN + 1];
+ char old_table_utf8[MAX_TABLE_NAME_LEN + 1];
+ uint errors = 0;
+
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ old_table_utf8[MAX_TABLE_NAME_LEN] = '\0';
+ innobase_convert_to_system_charset(
+ strchr(old_table_utf8, '/') + 1,
+ strchr(old_name, '/') +1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql#50). */
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ old_table_utf8[MAX_TABLE_NAME_LEN] = '\0';
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+ pars_info_add_str_literal(info, "old_table_name_utf8",
+ old_table_utf8);
+
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ new_table_name[MAX_TABLE_NAME_LEN] = '\0';
+ innobase_convert_to_system_charset(
+ strchr(new_table_name, '/') + 1,
+ strchr(new_name, '/') +1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql#50). */
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ new_table_name[MAX_TABLE_NAME_LEN] = '\0';
+ }
+
+ pars_info_add_str_literal(info, "new_table_utf8", new_table_name);
+
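+		/* Update the foreign key metadata: FOR_NAME and REF_NAME
+		rows pointing to the old table name are switched to the new
+		name, and constraint ids of the auto-generated form
+		<table>_ibfk_N get the new table name as prefix, while
+		explicitly named constraints only have their database name
+		prefix replaced. */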
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "offset INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+ "new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+ "new_db_name := SUBSTR(:new_table_name, 0,\n"
+ " new_db_name_len);\n"
+ "old_t_name_len := LENGTH(:old_table_name);\n"
+ "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n"
+ " '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :old_table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:old_table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = :new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " offset := INSTR(foreign_id, '_ibfk_') - 1;\n"
+ " new_foreign_id :=\n"
+ " CONCAT(:new_table_utf8,\n"
+ " SUBSTR(foreign_id, offset,\n"
+ " id_len - offset));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+ "WHERE REF_NAME = :old_table_name\n"
+ " AND TO_BINARY(REF_NAME)\n"
+ " = TO_BINARY(:old_table_name);\n"
+ "END;\n", trx);
+
+ } else if (n_constraints_to_drop > 0) {
+ /* Drop some constraints of tmp tables. */
+
+ ulint db_name_len = dict_get_db_name_len(old_name) + 1;
+ char* db_name = mem_heap_strdupl(heap, old_name,
+ db_name_len);
+ ulint i;
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ err = row_delete_constraint(constraints_to_drop[i],
+ db_name, heap, trx);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+ if (err == DB_SUCCESS
+ && (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID))
+ && !dict_tables_have_same_db(old_name, new_name)) {
+ err = fts_rename_aux_tables(table, new_name, trx);
+ }
+
+ switch (err) {
+ case DB_DUPLICATE_KEY:
+ ib::error() << "Table rename might cause two"
+ " FOREIGN KEY constraints to have the same"
+ " internal name in case-insensitive comparison.";
+ ib::info() << TROUBLESHOOTING_MSG;
+ /* fall through */
+ rollback_and_exit:
+ default:
+ trx->error_state = DB_SUCCESS;
+ trx->rollback();
+ trx->error_state = DB_SUCCESS;
+ break;
+ case DB_SUCCESS:
+ DEBUG_SYNC_C("innodb_rename_in_cache");
+ /* The following call will also rename the .ibd file */
+ err = dict_table_rename_in_cache(
+ table, span<const char>{new_name,strlen(new_name)},
+ false);
+ if (err != DB_SUCCESS) {
+ goto rollback_and_exit;
+ }
+
+ /* In case of copy alter, template db_name and
+ table_name should be renamed only for newly
+ created table. */
+ if (table->vc_templ != NULL && !new_is_tmp) {
+ innobase_rename_vc_templ(table);
+ }
+
+ /* We only want to switch off some of the type checking in
+ an ALTER TABLE, not in a RENAME. */
+ dict_names_t fk_tables;
+
+ err = dict_load_foreigns(
+ new_name, nullptr, trx->id,
+ !old_is_tmp || trx->check_foreigns,
+ use_fk
+ ? DICT_ERR_IGNORE_NONE
+ : DICT_ERR_IGNORE_FK_NOKEY,
+ fk_tables);
+
+ if (err != DB_SUCCESS) {
+ if (old_is_tmp) {
+ /* In case of copy alter, ignore the
+ loading of foreign key constraint
+ when foreign_key_check is disabled */
+ ib::error_or_warn(trx->check_foreigns)
+					<< "In ALTER TABLE, table "
+ << ut_get_name(trx, new_name)
+ << " has or is referenced in foreign"
+ " key constraints which are not"
+ " compatible with the new table"
+ " definition.";
+ if (!trx->check_foreigns) {
+ err = DB_SUCCESS;
+ break;
+ }
+ } else {
+ ib::error() << "In RENAME TABLE table "
+ << ut_get_name(trx, new_name)
+ << " is referenced in foreign key"
+ " constraints which are not compatible"
+ " with the new table definition.";
+ }
+
+ goto rollback_and_exit;
+ }
+
+ /* Check whether virtual column or stored column affects
+ the foreign key constraint of the table. */
+ if (dict_foreigns_has_s_base_col(table->foreign_set, table)) {
+ err = DB_NO_FK_ON_S_BASE_COL;
+ goto rollback_and_exit;
+ }
+
+		/* Fill the virtual column set in the foreign key
+		constraints when the table undergoes a copy ALTER
+		operation. */
+ dict_mem_table_free_foreign_vcol_set(table);
+ dict_mem_table_fill_foreign_vcol_set(table);
+
+ while (!fk_tables.empty()) {
+ const char *f = fk_tables.front();
+ dict_sys.load_table({f, strlen(f)});
+ fk_tables.pop_front();
+ }
+
+ table->data_dir_path= NULL;
+ }
+
+funct_exit:
+ if (table) {
+ table->release();
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
new file mode 100644
index 00000000..4756cc37
--- /dev/null
+++ b/storage/innobase/row/row0purge.cc
@@ -0,0 +1,1304 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.cc
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+#include "btr0cur.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "dict0crea.h"
+#include "dict0stats.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "handler.h"
+#include "ha_innodb.h"
+#include "fil0fil.h"
+#include "debug_sync.h"
+#include <mysql/service_thd_mdl.h>
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
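+
+/* A minimal illustrative sketch of the rule above (editorial note, not part
+of the original source): log_free_check() may wait for a log checkpoint, so
+it must run while no page latches or index locks are held, e.g.
+
+	log_free_check();	// no page latches or index locks held here
+	mtr.start();
+	... latch and modify pages ...
+	mtr.commit();		// latches released, redo safely generated
+*/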
+
+/***********************************************************//**
+Repositions the pcur in the purge node on the clustered index record,
+if found. If the record is not found, the pcur is closed.
+@return TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ btr_latch_mode mode, /*!< in: latching mode */
+ purge_node_t* node, /*!< in: row purge node */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (node->found_clust) {
+ ut_ad(node->validate_pcur());
+
+ node->found_clust =
+ node->pcur.restore_position(mode, mtr) ==
+ btr_pcur_t::SAME_ALL;
+
+ } else {
+ node->found_clust = row_search_on_row_ref(
+ &node->pcur, mode, node->table, node->ref, mtr);
+
+ if (node->found_clust) {
+ btr_pcur_store_position(&node->pcur, mtr);
+ }
+ }
+
+ /* Close the current cursor if we fail to position it correctly. */
+ if (!node->found_clust) {
+ btr_pcur_close(&node->pcur);
+ }
+
+ return(node->found_clust);
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@retval true if the row was not found, or it was successfully removed
+@retval false if the row was modified after the delete marking */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */
+{
+ dict_index_t* index = dict_table_get_first_index(node->table);
+ table_id_t table_id = 0;
+ index_id_t index_id = 0;
+ dict_table_t *table = nullptr;
+ pfs_os_file_t f = OS_FILE_CLOSED;
+
+ if (table_id) {
+retry:
+ dict_sys.lock(SRW_LOCK_CALL);
+ table = dict_sys.find_table(table_id);
+ if (!table) {
+ dict_sys.unlock();
+ } else if (table->n_rec_locks) {
+ for (dict_index_t* ind = UT_LIST_GET_FIRST(
+ table->indexes); ind;
+ ind = UT_LIST_GET_NEXT(indexes, ind)) {
+ if (ind->id == index_id) {
+ lock_discard_for_index(*ind);
+ }
+ }
+ }
+ }
+ mtr_t mtr;
+ mtr.start();
+ index->set_modified(mtr);
+ log_free_check();
+ bool success = true;
+
+ if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+ /* The record was already removed. */
+removed:
+ mtr.commit();
+close_and_exit:
+ if (table) {
+ dict_sys.unlock();
+ }
+ return success;
+ }
+
+ if (node->table->id == DICT_INDEXES_ID) {
+ /* If this is a record of the SYS_INDEXES table, then
+ we have to free the file segments of the index tree
+ associated with the index */
+ if (!table_id) {
+ const rec_t* rec = btr_pcur_get_rec(&node->pcur);
+
+ table_id = mach_read_from_8(rec);
+ index_id = mach_read_from_8(rec + 8);
+ if (table_id) {
+ mtr.commit();
+ goto retry;
+ }
+ ut_ad("corrupted SYS_INDEXES record" == 0);
+ }
+
+ const uint32_t space_id = dict_drop_index_tree(
+ &node->pcur, nullptr, &mtr);
+ if (space_id) {
+ if (table) {
+ if (table->get_ref_count() == 0) {
+ dict_sys.remove(table);
+ } else if (table->space_id == space_id) {
+ table->space = nullptr;
+ table->file_unreadable = true;
+ }
+ dict_sys.unlock();
+ table = nullptr;
+ }
+ f = fil_delete_tablespace(space_id);
+ }
+
+ mtr.commit();
+
+ if (table) {
+ dict_sys.unlock();
+ table = nullptr;
+ }
+
+ if (space_id) {
+ ibuf_delete_for_discarded_space(space_id);
+ }
+
+ mtr.start();
+ index->set_modified(mtr);
+
+ if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+ goto removed;
+ }
+ }
+
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) {
+ /* Someone else has modified the record later: do not remove */
+ goto func_exit;
+ }
+
+ ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = DB_FAIL != btr_cur_optimistic_delete(
+ btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
+ } else {
+ dberr_t err;
+ ut_ad(mode == BTR_PURGE_TREE);
+ btr_cur_pessimistic_delete(
+ &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0,
+ false, &mtr);
+ success = err == DB_SUCCESS;
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ /* Persistent cursor is closed if reposition fails. */
+ if (node->found_clust) {
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ } else {
+ mtr_commit(&mtr);
+ }
+
+ goto close_and_exit;
+}
+
+/***********************************************************//**
+Removes a clustered index record if it has not been modified after the delete
+marking.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of running out
+of file space. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+ return(true);
+ }
+
+ for (ulint n_tries = 0;
+ n_tries < BTR_CUR_RETRY_DELETE_N_TIMES;
+ n_tries++) {
+ if (row_purge_remove_clust_if_poss_low(node, BTR_PURGE_TREE)) {
+ return(true);
+ }
+
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
+ }
+
+ return(false);
+}
+
+/** Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+non-delete-marked version of a clustered index record whose DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@param[in,out] node row purge node
+@param[in] index secondary index
+@param[in] entry secondary index entry
+@param[in,out] sec_pcur secondary index cursor or NULL
+ if it is called for purge buffering
+ operation.
+@param[in,out] sec_mtr mini-transaction which holds
+ secondary index entry or NULL if it is
+ called for purge buffering operation.
+@param[in] is_tree true=pessimistic purge,
+ false=optimistic (leaf-page only)
+@return true if the secondary index record can be purged */
+bool
+row_purge_poss_sec(
+ purge_node_t* node,
+ dict_index_t* index,
+ const dtuple_t* entry,
+ btr_pcur_t* sec_pcur,
+ mtr_t* sec_mtr,
+ bool is_tree)
+{
+ bool can_delete;
+ mtr_t mtr;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ mtr_start(&mtr);
+
+ can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
+ || !row_vers_old_has_index_entry(true,
+ btr_pcur_get_rec(&node->pcur),
+ &mtr, index, entry,
+ node->roll_ptr, node->trx_id);
+
+ /* Persistent cursor is closed if reposition fails. */
+ if (node->found_clust) {
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ } else {
+ mtr.commit();
+ }
+
+ ut_ad(mtr.has_committed());
+
+ return can_delete;
+}
+
+/***************************************************************
+Removes a secondary index entry if possible, by modifying the
+index tree. Does not try to buffer the delete.
+@return TRUE if success or if not found */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ibool
+row_purge_remove_sec_if_poss_tree(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ ibool success = TRUE;
+ dberr_t err;
+ mtr_t mtr;
+
+ log_free_check();
+ mtr.start();
+ index->set_modified(mtr);
+ pcur.btr_cur.page_cur.index = index;
+
+ if (index->is_spatial()) {
+ if (!rtr_search(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
+ goto found;
+ }
+ goto func_exit;
+ }
+
+ switch (row_search_index_entry(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
+ case ROW_NOT_FOUND:
+ /* Not found. This is a legitimate condition. In a
+ rollback, InnoDB will remove secondary recs that would
+ be purged anyway. Then the actual purge will not find
+ the secondary index record. Also, the purge itself is
+ eager: if it comes to consider a secondary index
+ record, and notices it does not need to exist in the
+ index, it will remove it. Then if/when the purge
+ comes to consider the secondary index record a second
+ time, it will not exist any more in the index. */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(stderr, entry); */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+	/* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If any such
+	version requires it, we should do nothing. */
+
+found:
+ if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) {
+
+ /* Remove the index record, which should have been
+ marked for deletion. */
+ if (!rec_get_deleted_flag(btr_cur_get_rec(
+ btr_pcur_get_btr_cur(&pcur)),
+ dict_table_is_comp(index->table))) {
+ ib::error()
+ << "tried to purge non-delete-marked record"
+ " in index " << index->name
+ << " of table " << index->table->name
+ << ": tuple: " << *entry
+ << ", record: " << rec_index_print(
+ btr_cur_get_rec(
+ btr_pcur_get_btr_cur(&pcur)),
+ index);
+
+ ut_ad(0);
+
+ goto func_exit;
+ }
+
+ btr_cur_pessimistic_delete(&err, FALSE,
+ btr_pcur_get_btr_cur(&pcur),
+ 0, false, &mtr);
+ switch (UNIV_EXPECT(err, DB_SUCCESS)) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = FALSE;
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur); // FIXME: need this?
+ mtr.commit();
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry without modifying the index tree,
+if possible.
+@retval true if success or if not found
+@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_remove_sec_if_poss_leaf(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ bool success = true;
+
+ log_free_check();
+ ut_ad(index->table == node->table);
+ ut_ad(!index->table->is_temporary());
+ mtr.start();
+ index->set_modified(mtr);
+
+ pcur.btr_cur.page_cur.index = index;
+
+ /* Set the purge node for the call to row_purge_poss_sec(). */
+ pcur.btr_cur.purge_node = node;
+ if (index->is_spatial()) {
+ pcur.btr_cur.thr = NULL;
+ if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, &mtr)) {
+ goto found;
+ }
+ goto func_exit;
+ }
+
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
+
+ switch (row_search_index_entry(entry, index->has_virtual()
+ ? BTR_MODIFY_LEAF : BTR_PURGE_LEAF,
+ &pcur, &mtr)) {
+ case ROW_FOUND:
+found:
+ /* Before attempting to purge a record, check
+ if it is safe to do so. */
+ if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* Only delete-marked records should be purged. */
+ if (!rec_get_deleted_flag(
+ btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table))) {
+
+ ib::error()
+ << "tried to purge non-delete-marked"
+ " record" " in index " << index->name
+ << " of table " << index->table->name
+ << ": tuple: " << *entry
+ << ", record: "
+ << rec_index_print(
+ btr_cur_get_rec(btr_cur),
+ index);
+ mtr.commit();
+ dict_set_corrupted(index, "purge");
+ goto cleanup;
+ }
+
+ if (index->is_spatial()) {
+ const buf_block_t* block = btr_cur_get_block(
+ btr_cur);
+
+ if (block->page.id().page_no()
+ != index->page
+ && page_get_n_recs(block->page.frame) < 2
+ && !lock_test_prdt_page_lock(
+ btr_cur->rtr_info
+ && btr_cur->rtr_info->thr
+ ? thr_get_trx(
+ btr_cur->rtr_info->thr)
+ : nullptr,
+ block->page.id())) {
+					/* this is the last record on the
+					page, and it has a "page" lock on it,
+					which means a search is still
+					depending on it, so do not delete */
+ DBUG_LOG("purge",
+ "skip purging last"
+ " record on page "
+ << block->page.id());
+ goto func_exit;
+ }
+ }
+
+ success = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
+ != DB_FAIL;
+ }
+
+ /* (The index entry is still needed,
+ or the deletion succeeded) */
+ /* fall through */
+ case ROW_NOT_DELETED_REF:
+ /* The index entry is still needed. */
+ case ROW_BUFFERED:
+ /* The deletion was buffered. */
+ case ROW_NOT_FOUND:
+ /* The index entry does not exist, nothing to do. */
+func_exit:
+ mtr.commit();
+cleanup:
+ btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set?
+ return(success);
+ }
+
+ ut_error;
+ return(false);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible. */
+UNIV_INLINE MY_ATTRIBUTE((nonnull(1,2)))
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing secondary record\n", stderr); */
+
+ if (!entry) {
+ /* The node->row must have lacked some fields of this
+ index. This is possible when the undo log record was
+ written before this index was created. */
+ return;
+ }
+
+ if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_tree(node, index, entry);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***********************************************************//**
+Purges a delete marking of a record.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of
+running out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ if (node->index)
+ {
+ mem_heap_t *heap= mem_heap_create(1024);
+
+ do
+ {
+ if (node->index->type & (DICT_FTS | DICT_CORRUPT))
+ continue;
+ if (!node->index->is_committed())
+ continue;
+ dtuple_t* entry= row_build_index_entry_low(node->row, nullptr,
+ node->index, heap,
+ ROW_BUILD_FOR_PURGE);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+ mem_heap_empty(heap);
+ }
+ while ((node->index= dict_table_get_next_index(node->index)));
+
+ mem_heap_free(heap);
+ }
+
+ bool result= row_purge_remove_clust_if_poss(node);
+
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF("enable_row_purge_del_mark_exit_sync_point",
+ debug_sync_set_action
+ (current_thd,
+ STRING_WITH_LEN("now SIGNAL row_purge_del_mark_finished"));
+ );
+#endif
+
+ return result;
+}
+
+/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record
+whose old history can no longer be observed.
+@param[in,out] node purge node
+@param[in,out] mtr mini-transaction (will be started and committed) */
+static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
+{
+ /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */
+ mtr->start();
+
+ if (row_purge_reposition_pcur(BTR_MODIFY_LEAF, node, mtr)) {
+ dict_index_t* index = dict_table_get_first_index(
+ node->table);
+ ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+ mem_heap_t* heap = NULL;
+ /* Reserve enough offsets for the PRIMARY KEY and 2 columns
+ so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ rec_offs_init(offsets_);
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, offsets_, index->n_core_fields,
+ trx_id_pos + 2, &heap);
+ ut_ad(heap == NULL);
+
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos)
+ ->col->mtype == DATA_SYS);
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos)
+ ->col->prtype == (DATA_TRX_ID | DATA_NOT_NULL));
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1)
+ ->col->mtype == DATA_SYS);
+ ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1)
+ ->col->prtype == (DATA_ROLL_PTR | DATA_NOT_NULL));
+
+ /* Only update the record if DB_ROLL_PTR matches (the
+ record has not been modified after this transaction
+ became purgeable) */
+ if (node->roll_ptr
+ == row_get_rec_roll_ptr(rec, index, offsets)) {
+ ut_ad(!rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets))
+ || rec_is_alter_metadata(rec, *index));
+ DBUG_LOG("purge", "reset DB_TRX_ID="
+ << ib::hex(row_get_rec_trx_id(
+ rec, index, offsets)));
+
+ index->set_modified(*mtr);
+ buf_block_t* block = btr_pcur_get_block(&node->pcur);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ block, rec, offsets, trx_id_pos,
+ 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
+ mtr);
+ } else {
+ ulint len;
+ byte* ptr = rec_get_nth_field(
+ rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ size_t offs = page_offset(ptr);
+ mtr->memset(block, offs, DATA_TRX_ID_LEN, 0);
+ offs += DATA_TRX_ID_LEN;
+ mtr->write<1,mtr_t::MAYBE_NOP>(
+ *block, block->page.frame + offs,
+ 0x80U);
+ mtr->memset(block, offs + 1,
+ DATA_ROLL_PTR_LEN - 1, 0);
+ }
+ }
+ }
+
+ mtr->commit();
+}
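+
+/* Editorial note on row_purge_reset_trx_id() above (not part of the original
+source): both branches write the same logical values, DB_TRX_ID = 0 and a
+DB_ROLL_PTR whose only set bit is the insert flag, the most significant bit
+of the 7-byte field; hence the 0x80 byte in the uncompressed path and
+1ULL << ROLL_PTR_INSERT_FLAG_POS in the ROW_FORMAT=COMPRESSED path. */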
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern_func(
+/*===============================*/
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ purge_node_t* node, /*!< in: row purge node */
+ const trx_undo_rec_t* undo_rec) /*!< in: record to purge */
+{
+ mem_heap_t* heap;
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ || !node->index) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ do {
+ if (node->index->type & (DICT_FTS | DICT_CORRUPT)) {
+ continue;
+ }
+
+ if (!node->index->is_committed()) {
+ continue;
+ }
+
+ if (row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, NULL, NULL)) {
+ /* Build the older version of the index entry */
+ dtuple_t* entry = row_build_index_entry_low(
+ node->row, NULL, node->index,
+ heap, ROW_BUILD_FOR_PURGE);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+
+ ut_ad(node->table);
+
+ mem_heap_empty(heap);
+ }
+ } while ((node->index = dict_table_get_next_index(node->index)));
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ mtr_t mtr;
+ dict_index_t* index = dict_table_get_first_index(node->table);
+ /* Free possible externally stored fields */
+ for (ulint i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ const upd_field_t* ufield
+ = upd_get_nth_field(node->update, i);
+
+ if (dfield_is_ext(&ufield->new_val)) {
+ bool is_insert;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+
+			/* We use the fact that new_val points into
+			undo_rec, and thus obtain the offset of the
+			dfield data inside the undo record. Then we
+			can calculate from node->roll_ptr the file
+			address of the new_val data. */
+
+ const uint16_t internal_offset = uint16_t(
+ static_cast<const byte*>
+ (dfield_get_data(&ufield->new_val))
+ - undo_rec);
+
+ ut_a(internal_offset < srv_page_size);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+
+ const trx_rseg_t &rseg = trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg.is_persistent());
+
+ mtr.start();
+
+ /* We have to acquire an SX-latch to the clustered
+ index tree (exclude other tree changes) */
+
+ mtr_sx_lock_index(index, &mtr);
+
+ index->set_modified(mtr);
+
+ /* NOTE: we must also acquire a U latch to the
+ root page of the tree. We will need it when we
+ free pages from the tree. If the tree is of height 1,
+ the tree X-latch does NOT protect the root page,
+ because it is also a leaf page. Since we will have a
+ latch on an undo log page, we would break the
+ latching order if we would only later latch the
+ root page of such a tree! */
+
+ dberr_t err;
+ if (!btr_root_block_get(index, RW_SX_LATCH, &mtr,
+ &err)) {
+ } else if (buf_block_t* block =
+ buf_page_get(page_id_t(rseg.space->id,
+ page_no),
+ 0, RW_X_LATCH, &mtr)) {
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+
+ byte* data_field = block->page.frame
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field
+ + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, block, 0, false, &mtr);
+ }
+
+ mtr.commit();
+ }
+ }
+
+ row_purge_reset_trx_id(node, &mtr);
+}
+
+#ifdef UNIV_DEBUG
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(thr,node,undo_rec)
+#else /* UNIV_DEBUG */
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(node,undo_rec)
+#endif /* UNIV_DEBUG */
+
+/** Build a partial row from an update undo log record for purge.
+Any columns which occur as ordering in any index of the table are present.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+
+@param ptr remaining part of the undo log record
+@param index clustered index
+@param node purge node
+@return pointer to remaining part of undo record */
+static byte *row_purge_get_partial(const byte *ptr, const dict_index_t &index,
+ purge_node_t *node)
+{
+ bool first_v_col= true;
+ bool is_undo_log= true;
+
+ ut_ad(index.is_primary());
+ ut_ad(index.n_uniq == node->ref->n_fields);
+
+ node->row= dtuple_create_with_vcol(node->heap, index.table->n_cols,
+ index.table->n_v_cols);
+
+ /* Mark all columns in the row uninitialized, so that
+ we can distinguish missing fields from fields that are SQL NULL. */
+ for (ulint i= 0; i < index.table->n_cols; i++)
+ node->row->fields[i].type.mtype= DATA_MISSING;
+
+ dtuple_init_v_fld(node->row);
+
+ for (const upd_field_t *uf= node->update->fields, *const ue=
+ node->update->fields + node->update->n_fields; uf != ue; uf++)
+ {
+ if (!uf->old_v_val)
+ {
+ const dict_col_t &c= *dict_index_get_nth_col(&index, uf->field_no);
+ if (!c.is_dropped())
+ node->row->fields[c.ind]= uf->new_val;
+ }
+ }
+
+ const byte *end_ptr= ptr + mach_read_from_2(ptr);
+ ptr+= 2;
+
+ while (ptr != end_ptr)
+ {
+ dfield_t *dfield;
+ const byte *field;
+ const dict_col_t *col;
+ uint32_t len, orig_len, field_no= mach_read_next_compressed(&ptr);
+
+ if (field_no >= REC_MAX_N_FIELDS)
+ {
+ ptr= trx_undo_read_v_idx(index.table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col= false;
+
+ ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ if (field_no == FIL_NULL)
+ continue; /* there no longer is an index on the virtual column */
+
+ dict_v_col_t *vcol= dict_table_get_nth_v_col(index.table, field_no);
+ col =&vcol->m_col;
+ dfield= dtuple_get_nth_v_field(node->row, vcol->v_pos);
+ dict_col_copy_type(&vcol->m_col, &dfield->type);
+ }
+ else
+ {
+ ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ col= dict_index_get_nth_col(&index, field_no);
+ if (col->is_dropped())
+ continue;
+ dfield= dtuple_get_nth_field(node->row, col->ind);
+ ut_ad(dfield->type.mtype == DATA_MISSING ||
+ dict_col_type_assert_equal(col, &dfield->type));
+ ut_ad(dfield->type.mtype == DATA_MISSING ||
+ dfield->len == len ||
+ (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD));
+ dict_col_copy_type(dict_table_get_nth_col(index.table, col->ind),
+ &dfield->type);
+ }
+
+ dfield_set_data(dfield, field, len);
+
+ if (len == UNIV_SQL_NULL || len < UNIV_EXTERN_STORAGE_FIELD)
+ continue;
+
+ spatial_status_t spatial_status= static_cast<spatial_status_t>
+ ((len & SPATIAL_STATUS_MASK) >> SPATIAL_STATUS_SHIFT);
+ len&= ~SPATIAL_STATUS_MASK;
+
+ /* Keep compatible with 5.7.9 format. */
+ if (spatial_status == SPATIAL_UNKNOWN)
+ spatial_status= dict_col_get_spatial_status(col);
+
+ switch (UNIV_EXPECT(spatial_status, SPATIAL_NONE)) {
+ case SPATIAL_ONLY:
+ ut_ad(len - UNIV_EXTERN_STORAGE_FIELD == DATA_MBR_LEN);
+ dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_MIXED:
+ dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD - DATA_MBR_LEN);
+ break;
+
+ default:
+ dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+ }
+
+ dfield_set_ext(dfield);
+ dfield_set_spatial_status(dfield, spatial_status);
+
+ if (!col->ord_part || spatial_status == SPATIAL_ONLY ||
+ node->rec_type == TRX_UNDO_UPD_DEL_REC)
+ continue;
+ /* If the prefix of this BLOB column is indexed, ensure that enough
+ prefix is stored in the undo log record. */
+ ut_a(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_has_atomic_blobs(index.table) ||
+ dfield_get_len(dfield) >=
+ REC_ANTELOPE_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+
+ for (ulint i= 0; i < index.n_uniq; i++)
+ {
+ dfield_t &field= node->row->fields[index.fields[i].col->ind];
+ if (field.type.mtype == DATA_MISSING)
+ field= node->ref->fields[i];
+ }
+
+ return const_cast<byte*>(ptr);
+}
+
+MY_ATTRIBUTE((nonnull,warn_unused_result))
+/** Parses the row reference and other info in a modify undo log record.
+@param[in] node row undo node
+@param[in] undo_rec record to purge
+@param[in] thr query thread
+@param[out] updated_extern true if an externally stored field was
+ updated
+@return true if purge operation required */
+static
+bool
+row_purge_parse_undo_rec(
+ purge_node_t* node,
+ const trx_undo_rec_t* undo_rec,
+ que_thr_t* thr,
+ bool* updated_extern)
+{
+ dict_index_t* clust_index;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ byte type;
+
+ const byte* ptr = trx_undo_rec_get_pars(
+ undo_rec, &type, &node->cmpl_info,
+ updated_extern, &undo_no, &table_id);
+
+ node->rec_type = type;
+
+ switch (type) {
+ case TRX_UNDO_RENAME_TABLE:
+ return false;
+ case TRX_UNDO_EMPTY:
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ /* These records do not store any transaction identifier. */
+ node->trx_id = TRX_ID_MAX;
+ break;
+ default:
+#ifdef UNIV_DEBUG
+ ut_ad("unknown undo log record type" == 0);
+ return false;
+ case TRX_UNDO_UPD_DEL_REC:
+ case TRX_UNDO_UPD_EXIST_REC:
+ case TRX_UNDO_DEL_MARK_REC:
+#endif /* UNIV_DEBUG */
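+		/* Editorial note (not part of the original source): in
+		non-debug builds the default: label above also covers the
+		three update/delete-mark record types, so they all reach the
+		call below; in debug builds an unrecognized type triggers the
+		assertion and the record is skipped. */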
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id,
+ &roll_ptr, &info_bits);
+ break;
+ }
+
+ auto &tables_entry= node->tables[table_id];
+ node->table = tables_entry.first;
+ if (!node->table) {
+ return false;
+ }
+
+#ifndef DBUG_OFF
+ if (MDL_ticket* mdl = tables_entry.second) {
+ static_cast<MDL_context*>(thd_mdl_context(current_thd))
+ ->lock_warrant = mdl->get_ctx();
+ }
+#endif
+ ut_ad(!node->table->is_temporary());
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index->is_corrupted()) {
+ /* The table was corrupt in the data dictionary.
+ dict_set_corrupted() works on an index, and
+ we do not have an index to call it with. */
+ DBUG_ASSERT(table_id == node->table->id);
+ return false;
+ }
+
+ switch (type) {
+ case TRX_UNDO_INSERT_METADATA:
+ node->ref = &trx_undo_metadata;
+ return true;
+ case TRX_UNDO_EMPTY:
+ node->ref = nullptr;
+ return true;
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ if (type == TRX_UNDO_INSERT_REC) {
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type,
+ node->trx_id,
+ roll_ptr, info_bits,
+ node->heap, &(node->update));
+
+	/* Read into the partial row the fields that occur in indexes */
+
+ if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ut_ad(!(node->update->info_bits & REC_INFO_MIN_REC_FLAG));
+ ptr = row_purge_get_partial(ptr, *clust_index, node);
+ } else if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) {
+ node->ref = &trx_undo_metadata;
+ }
+
+ return(true);
+}
+
+/** Purges the parsed record.
+@param[in] node row purge node
+@param[in] undo_rec record to purge
+@param[in] thr query thread
+@param[in] updated_extern whether external columns were updated
+@return true if purged, false if skipped */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+row_purge_record_func(
+ purge_node_t* node,
+ const trx_undo_rec_t* undo_rec,
+#if defined UNIV_DEBUG || defined WITH_WSREP
+ const que_thr_t*thr,
+#endif /* UNIV_DEBUG || WITH_WSREP */
+ bool updated_extern)
+{
+ ut_ad(!node->found_clust);
+ ut_ad(!node->table->skip_alter_undo);
+ ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ bool purged = true;
+
+ switch (node->rec_type) {
+ case TRX_UNDO_EMPTY:
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ purged = row_purge_del_mark(node);
+ if (purged) {
+ if (node->table->stat_initialized
+ && srv_stats_include_delete_marked) {
+ dict_stats_update_if_needed(
+ node->table, *thr->graph->trx);
+ }
+ MONITOR_INC(MONITOR_N_DEL_ROW_PURGE);
+ }
+ break;
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
+ /* fall through */
+ default:
+ if (!updated_extern) {
+ mtr_t mtr;
+ row_purge_reset_trx_id(node, &mtr);
+ break;
+ }
+ /* fall through */
+ case TRX_UNDO_UPD_EXIST_REC:
+ row_purge_upd_exist_or_extern(thr, node, undo_rec);
+ MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN);
+ break;
+ }
+
+ if (node->found_clust) {
+ node->found_clust = false;
+ btr_pcur_close(&node->pcur);
+ }
+
+ return(purged);
+}
+
+#if defined UNIV_DEBUG || defined WITH_WSREP
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,thr,updated_extern)
+#else /* UNIV_DEBUG || WITH_WSREP */
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,updated_extern)
+#endif /* UNIV_DEBUG || WITH_WSREP */
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node. */
+static MY_ATTRIBUTE((nonnull))
+void
+row_purge(
+/*======*/
+ purge_node_t* node, /*!< in: row purge node */
+ const trx_undo_rec_t* undo_rec, /*!< in: record to purge */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if (undo_rec != reinterpret_cast<trx_undo_rec_t*>(-1)) {
+ bool updated_extern;
+
+ while (row_purge_parse_undo_rec(
+ node, undo_rec, thr, &updated_extern)) {
+
+ bool purged = row_purge_record(
+ node, undo_rec, thr, updated_extern);
+
+ if (purged
+ || srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
+ return;
+ }
+
+ /* Retry the purge in a second. */
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+ }
+ }
+}
+
+inline void purge_node_t::start()
+{
+ ut_ad(in_progress);
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+
+ row= nullptr;
+ ref= nullptr;
+ index= nullptr;
+ update= nullptr;
+ found_clust= false;
+ rec_type= 0;
+ cmpl_info= 0;
+}
+
+/** Reset the state at end
+@return the query graph parent */
+inline que_node_t *purge_node_t::end(THD *thd)
+{
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+ ut_ad(undo_recs.empty());
+ ut_d(in_progress= false);
+ innobase_reset_background_thd(thd);
+#ifndef DBUG_OFF
+ static_cast<MDL_context*>(thd_mdl_context(thd))->lock_warrant= nullptr;
+#endif
+ mem_heap_empty(heap);
+ return common.parent;
+}
+
+
+/***********************************************************//**
+Does the purge operation.
+@return query thread to run next */
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+
+ node = static_cast<purge_node_t*>(thr->run_node);
+
+ node->start();
+
+ while (!node->undo_recs.empty()) {
+ trx_purge_rec_t purge_rec = node->undo_recs.front();
+ node->undo_recs.pop();
+ node->roll_ptr = purge_rec.roll_ptr;
+
+ row_purge(node, purge_rec.undo_rec, thr);
+ }
+
+ thr->run_node = node->end(current_thd);
+ return(thr);
+}
+
+#ifdef UNIV_DEBUG
+/***********************************************************//**
+Validate the persistent cursor. The purge node has two references
+to the clustered index record - one via the ref member, and the
+other via the persistent cursor. These two references must match
+each other if the found_clust flag is set.
+@return true if the stored copy of persistent cursor is consistent
+with the ref member.*/
+bool
+purge_node_t::validate_pcur()
+{
+ if (!found_clust) {
+ return(true);
+ }
+
+ if (index == NULL) {
+ return(true);
+ }
+
+ if (index->type == DICT_FTS) {
+ return(true);
+ }
+
+ if (!pcur.old_rec) {
+ return(true);
+ }
+
+ dict_index_t* clust_index = pcur.index();
+
+ rec_offs* offsets = rec_get_offsets(
+ pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields,
+ pcur.old_n_fields, &heap);
+
+	/* Here we are comparing the purge ref record and the stored initial
+	part in the persistent cursor. In both cases we store n_uniq fields of
+	the clustered index, so it is fine to do the comparison. We note this
+	dependency here as pcur and ref belong to different modules. */
+ int st = cmp_dtuple_rec(ref, pcur.old_rec, clust_index, offsets);
+
+ if (st != 0) {
+ ib::error() << "Purge node pcur validation failed";
+ ib::error() << rec_printer(ref).str();
+ ib::error() << rec_printer(pcur.old_rec, offsets).str();
+ return(false);
+ }
+
+ return(true);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
new file mode 100644
index 00000000..e927096f
--- /dev/null
+++ b/storage/innobase/row/row0quiesce.cc
@@ -0,0 +1,715 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0quiesce.cc
+Quiesce a tablespace.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0quiesce.h"
+#include "row0mysql.h"
+#include "buf0flu.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+/*********************************************************************//**
+Write the meta data (index user fields) config file.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_index_fields(
+/*===========================*/
+ const dict_index_t* index, /*!< in: write the meta data for
+ this index */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte row[sizeof(ib_uint32_t) * 2];
+
+ for (ulint i = 0; i < index->n_fields; ++i) {
+ byte* ptr = row;
+ const dict_field_t* field = &index->fields[i];
+
+ mach_write_to_4(ptr, field->prefix_len);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, field->fixed_len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_9",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ const char* field_name = field->name ? field->name : "";
+ /* Include the NUL byte in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field_name) + 1);
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_10",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(field_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index column.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file index information.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_indexes(
+/*======================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ ulint n_indexes = 0;
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index; index = UT_LIST_GET_NEXT(indexes, index)) {
+ n_indexes += index->is_committed();
+ }
+
+ {
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Write the number of indexes in the table. */
+ mach_write_to_4(row, n_indexes);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_11",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index count.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ dberr_t err = DB_SUCCESS;
+
+ /* Write the index meta data. */
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0 && err == DB_SUCCESS;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (!index->is_committed()) {
+ continue;
+ }
+
+ ut_ad(n_indexes); ut_d(n_indexes--);
+
+ byte* ptr;
+ byte row[sizeof(index_id_t)
+ + sizeof(ib_uint32_t) * 8];
+
+ ptr = row;
+
+ ut_ad(sizeof(index_id_t) == 8);
+ mach_write_to_8(ptr, index->id);
+ ptr += sizeof(index_id_t);
+
+ mach_write_to_4(ptr, table->space_id);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->page);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->type);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->trx_id_offset);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_user_defined_cols);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_uniq);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_nullable);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_fields);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the length of the index name.
+ NUL byte is included in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(index->name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(index->name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing index name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ err = row_quiesce_write_index_fields(index, file, thd);
+ }
+
+ ut_ad(!n_indexes);
+ return(err);
+}
+
+/*********************************************************************//**
+Write the meta data (table columns) config file. Serialise the contents of
+dict_col_t structure, along with the column name. All fields are serialized
+as ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 7];
+
+ col = table->cols;
+
+ for (ulint i = 0; i < table->n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ mach_write_to_4(ptr, col->prtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->mtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->len);
+ ptr += sizeof(ib_uint32_t);
+
+ /* FIXME: This will not work if mbminlen>4.
+ This field is also redundant, because the lengths
+ are a property of the character set encoding, which
+		in turn is encoded in prtype above. */
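+		/* Editorial example (not part of the original source):
+		a latin1 column (mbminlen = mbmaxlen = 1) stores
+		1 * 5 + 1 = 6, and a utf8mb4 column (mbminlen = 1,
+		mbmaxlen = 4) stores 4 * 5 + 1 = 21; a reader can recover
+		mbminlen = value % 5 and mbmaxlen = value / 5. */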
+ mach_write_to_4(ptr, ulint(col->mbmaxlen * 5 + col->mbminlen));
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ind);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ord_part);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->max_prefix);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_2",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table column data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write out the column name as [len, byte array]. The len
+ includes the NUL byte. */
+ ib_uint32_t len;
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+ /* Include the NUL byte in the length. */
+ len = static_cast<ib_uint32_t>(strlen(col_name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_3",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(col_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing column name.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file header.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_header(
+/*=====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Write the meta-data version number. */
+ mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing meta-data version number.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the server hostname. */
+ ib_uint32_t len;
+ const char* hostname = server_get_hostname();
+
+ /* Play it safe and check for NULL. */
+ if (hostname == 0) {
+ static const char NullHostname[] = "Hostname unknown";
+
+ ib::warn() << "Unable to determine server hostname.";
+
+ hostname = NullHostname;
+ }
+
+	/* The server hostname length includes the NUL byte. */
+ len = static_cast<ib_uint32_t>(strlen(hostname) + 1);
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(hostname, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing hostname.");
+
+ return(DB_IO_ERROR);
+ }
+
+	/* The table name length includes the NUL byte. */
+ ut_a(table->name.m_name != NULL);
+ len = static_cast<ib_uint32_t>(strlen(table->name.m_name) + 1);
+
+ /* Write the table name. */
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(table->name.m_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Write the next autoinc value. */
+ mach_write_to_8(row, table->autoinc);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte* ptr = row;
+
+ /* Write the system page size. */
+ mach_write_to_4(ptr, srv_page_size);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the table->flags. */
+ mach_write_to_4(ptr, table->flags);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the number of columns in the table. */
+ mach_write_to_4(ptr, table->n_cols);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno),
+ "while writing table meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data after quiesce.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+ dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ THD* thd) /*!< in/out: session */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ ib::info() << "Writing table metadata to '" << name << "'";
+
+ FILE* file = fopen(name, "w+b");
+
+ if (file == NULL) {
+ ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+ name, errno, strerror(errno));
+
+ err = DB_IO_ERROR;
+ } else {
+ err = row_quiesce_write_header(table, file, thd);
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_table(table, file, thd);
+ }
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_indexes(table, file, thd);
+ }
+
+ if (fflush(file) != 0) {
+
+ char msg[BUFSIZ];
+
+ snprintf(msg, sizeof(msg), "%s flush() failed", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno), msg);
+ }
+
+ if (fclose(file) != 0) {
+ char msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s fclose() failed", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ (ulong) errno, strerror(errno), msg);
+ }
+ }
+
+ return(err);
+}
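+
+/* Editorial sketch of the .cfg layout produced by the writers above
+(informational only, derived from the code; not part of the original source):
+
+	version (4 bytes, IB_EXPORT_CFG_VERSION_V1)
+	hostname length (4) + hostname (NUL-terminated)
+	table name length (4) + table name (NUL-terminated)
+	autoinc counter (8)
+	page size (4), table flags (4), number of columns (4)
+	per column: prtype, mtype, len, mbminmaxlen, ind, ord_part,
+		max_prefix (7 x 4 bytes) + name length (4) + name
+	number of indexes (4)
+	per index: id (8), space id, root page, type, trx_id_offset,
+		n_user_defined_cols, n_uniq, n_nullable, n_fields (8 x 4)
+		+ name length (4) + name
+		+ per field: prefix_len (4) + fixed_len (4)
+			+ name length (4) + name
+*/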
+
+/*********************************************************************//**
+Check whether a table has an FTS index defined on it.
+@return true if an FTS index exists on the table */
+static
+bool
+row_quiesce_table_has_fts_index(
+/*============================*/
+ const dict_table_t* table) /*!< in: quiesce this table */
+{
+ bool exists = false;
+
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ exists = true;
+ break;
+ }
+ }
+
+ return(exists);
+}
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ut_a(trx->mysql_thd != 0);
+ ut_a(srv_n_purge_threads > 0);
+ ut_ad(!srv_read_only_mode);
+
+ ut_a(trx->mysql_thd != 0);
+
+ ut_ad(table->space != NULL);
+ ib::info() << "Sync to disk of " << table->name << " started.";
+
+ if (srv_undo_sources) {
+ purge_sys.stop();
+ }
+
+ for (ulint count = 0;
+ ibuf_merge_space(table->space_id);
+ ++count) {
+ if (trx_is_interrupted(trx)) {
+ goto aborted;
+ }
+ if (!(count % 20)) {
+ ib::info() << "Merging change buffer entries for "
+ << table->name;
+ }
+ }
+
+ while (buf_flush_list_space(table->space)) {
+ if (trx_is_interrupted(trx)) {
+ goto aborted;
+ }
+ }
+
+ if (!trx_is_interrupted(trx)) {
+ /* Ensure that all asynchronous IO is completed. */
+ os_aio_wait_until_no_pending_writes(true);
+ table->space->flush<false>();
+
+ if (row_quiesce_write_cfg(table, trx->mysql_thd)
+ != DB_SUCCESS) {
+ ib::warn() << "There was an error writing to the"
+ " meta data file";
+ } else {
+ ib::info() << "Table " << table->name
+ << " flushed to disk";
+ }
+ } else {
+aborted:
+ ib::warn() << "Quiesce aborted!";
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx);
+ ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ulint count = 0;
+
+ ut_a(trx->mysql_thd != 0);
+
+ /* We need to wait for the operation to complete if the
+ transaction has been killed. */
+
+ while (table->quiesce != QUIESCE_COMPLETE) {
+
+ /* Print a warning after every minute. */
+ if (!(count % 60)) {
+ ib::warn() << "Waiting for quiesce of " << table->name
+ << " to complete";
+ }
+
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ ++count;
+ }
+
+ if (!opt_bootstrap) {
+ /* Remove the .cfg file now that the user has resumed
+ normal operations. Otherwise it will cause problems when
+ the user tries to drop the database (remove directory). */
+ char cfg_name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name));
+
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
+
+ ib::info() << "Deleting the meta-data file '" << cfg_name << "'";
+ }
+
+ if (srv_undo_sources) {
+ purge_sys.resume();
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx);
+ ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ if (srv_read_only_mode) {
+
+ ib_senderrf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+ return(DB_UNSUPPORTED);
+
+ } else if (table->is_temporary()) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_CANNOT_DISCARD_TEMPORARY_TABLE);
+
+ return(DB_UNSUPPORTED);
+ } else if (table->space_id == TRX_SYS_SPACE) {
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ table->name.m_name);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+ return(DB_UNSUPPORTED);
+ } else if (row_quiesce_table_has_fts_index(table)) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+ "FLUSH TABLES on tables that have an FTS index."
+ " FTS auxiliary tables will not be flushed.");
+
+ } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ /* If this flag is set then the table may not have any active
+ FTS indexes but it will still have the auxiliary tables. */
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+ "FLUSH TABLES on a table that had an FTS index,"
+ " created on a hidden column, the"
+ " auxiliary tables haven't been dropped as yet."
+ " FTS auxiliary tables will not be flushed.");
+ }
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ for (dict_index_t* index = dict_table_get_next_index(clust_index);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ }
+
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+
+ switch (state) {
+ case QUIESCE_START:
+ break;
+
+ case QUIESCE_COMPLETE:
+ ut_a(table->quiesce == QUIESCE_START);
+ break;
+
+ case QUIESCE_NONE:
+ ut_a(table->quiesce == QUIESCE_COMPLETE);
+ break;
+ }
+
+ table->quiesce = state;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->lock.x_unlock();
+ }
+
+ return(DB_SUCCESS);
+}
+
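
The quiesce functions above implement a small state machine, QUIESCE_NONE -> QUIESCE_START -> QUIESCE_COMPLETE -> QUIESCE_NONE, whose transition rules are enforced only by the ut_a() assertions in row_quiesce_set_state(). Below is a minimal standalone sketch of those rules, for illustration only; QuiesceState and is_valid_transition are invented names and not part of InnoDB.

#include <cassert>

// Illustrative mirror of ib_quiesce_t; not the InnoDB definition.
enum class QuiesceState { NONE, START, COMPLETE };

// Transition rules matching the assertions in row_quiesce_set_state():
// COMPLETE may only follow START, NONE may only follow COMPLETE,
// and entering START is not guarded by an assertion.
static bool is_valid_transition(QuiesceState from, QuiesceState to)
{
	switch (to) {
	case QuiesceState::START:    return true;
	case QuiesceState::COMPLETE: return from == QuiesceState::START;
	case QuiesceState::NONE:     return from == QuiesceState::COMPLETE;
	}
	return false;
}

int main()
{
	QuiesceState s = QuiesceState::NONE;
	assert(is_valid_transition(s, QuiesceState::START));
	s = QuiesceState::START;
	assert(is_valid_transition(s, QuiesceState::COMPLETE));
	s = QuiesceState::COMPLETE;
	assert(is_valid_transition(s, QuiesceState::NONE));
	assert(!is_valid_transition(QuiesceState::NONE, QuiesceState::COMPLETE));
	return 0;
}
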
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
new file mode 100644
index 00000000..4a00b2a4
--- /dev/null
+++ b/storage/innobase/row/row0row.cc
@@ -0,0 +1,1720 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.cc
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "ut0mem.h"
+#include "gis0geo.h"
+#include "row0mysql.h"
+
+/** Build a spatial index key.
+@param[in] index spatial index
+@param[in] ext externally stored column prefixes, or NULL
+@param[in,out] dfield field of the tuple to be copied
+@param[in] dfield2 field of the tuple to copy
+@param[in] flag ROW_BUILD_NORMAL, ROW_BUILD_FOR_PURGE or
+ ROW_BUILD_FOR_UNDO
+@param[in,out] heap memory heap from which the memory
+ of the field entry is allocated.
+@retval false if undo log is logged before spatial index creation. */
+static bool row_build_spatial_index_key(
+ const dict_index_t* index,
+ const row_ext_t* ext,
+ dfield_t* dfield,
+ const dfield_t* dfield2,
+ ulint flag,
+ mem_heap_t* heap)
+{
+ if (dfield2->type.mtype == DATA_MISSING) {
+ return false;
+ }
+
+ double* mbr;
+
+ dfield_copy(dfield, dfield2);
+ dfield->type.prtype |= DATA_GIS_MBR;
+
+ /* Allocate memory for mbr field */
+ mbr = static_cast<double*>(mem_heap_alloc(heap, DATA_MBR_LEN));
+
+ /* Set mbr field data. */
+ dfield_set_data(dfield, mbr, DATA_MBR_LEN);
+
+ const fil_space_t* space = index->table->space;
+
+ if (UNIV_UNLIKELY(!dfield2->data || !space)) {
+ /* FIXME: dfield contains uninitialized data,
+ but row_build_index_entry_low() will not return NULL.
+ This bug is inherited from MySQL 5.7.5
+ commit b66ad511b61fffe75c58d0a607cdb837c6e6c821. */
+ return true;
+ }
+
+ const byte* dptr = NULL;
+ ulint dlen = 0;
+ ulint flen = 0;
+ double tmp_mbr[SPDIMS * 2];
+ mem_heap_t* temp_heap = NULL;
+
+ if (!dfield_is_ext(dfield2)) {
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2));
+ dlen = dfield_get_len(dfield2);
+ ut_ad(dptr != &data_error);
+ goto write_mbr;
+ }
+
+ if (flag == ROW_BUILD_FOR_PURGE) {
+ const byte* ptr = static_cast<const byte*>(
+ dfield_get_data(dfield2));
+
+ switch (dfield_get_spatial_status(dfield2)) {
+ case SPATIAL_ONLY:
+ ut_ad(dfield_get_len(dfield2) == DATA_MBR_LEN);
+ break;
+
+ case SPATIAL_MIXED:
+ ptr += dfield_get_len(dfield2);
+ break;
+
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_NONE:
+ /* Undo record is logged before
+ spatial index is created.*/
+ return false;
+ }
+
+ memcpy(mbr, ptr, DATA_MBR_LEN);
+ return true;
+ }
+
+ if (flag == ROW_BUILD_FOR_UNDO
+ && dict_table_has_atomic_blobs(index->table)) {
+ /* For ROW_FORMAT=DYNAMIC or COMPRESSED, a prefix of
+ off-page records is stored in the undo log record (for
+ any column prefix indexes). For SPATIAL INDEX, we
+ must ignore this prefix. The full column value is
+ stored in the BLOB. For non-spatial index, we would
+ have already fetched a necessary prefix of the BLOB,
+ available in the "ext" parameter.
+
+ Here, for SPATIAL INDEX, we are fetching the full
+ column, which is potentially wasting a lot of I/O,
+ memory, and possibly involving a concurrency problem,
+ similar to ones that existed before the introduction
+ of row_ext_t.
+
+ MDEV-11657 FIXME: write the MBR directly to the undo
+ log record, and avoid recomputing it here! */
+ flen = BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(dfield_get_len(dfield2) >= BTR_EXTERN_FIELD_REF_SIZE);
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2))
+ + dfield_get_len(dfield2)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ flen = dfield_get_len(dfield2);
+ dptr = static_cast<const byte*>(dfield_get_data(dfield2));
+ }
+
+ temp_heap = mem_heap_create(1000);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr, ext ? ext->zip_size : space->zip_size(),
+ flen, temp_heap);
+
+write_mbr:
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
+ for (uint i = 0; i < SPDIMS; i += 2) {
+ tmp_mbr[i] = DBL_MAX;
+ tmp_mbr[i + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ uint(dlen - GEO_DATA_HEADER_SIZE),
+ SPDIMS, tmp_mbr);
+ }
+
+ dfield_write_mbr(dfield, tmp_mbr);
+ if (temp_heap) {
+ mem_heap_free(temp_heap);
+ }
+
+ return true;
+}
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ const dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap, /*!< in,out: memory heap from which
+ the memory for the index entry
+ is allocated */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE
+ or ROW_BUILD_FOR_UNDO */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ ulint i = 0;
+ ulint num_v = 0;
+
+ entry_len = dict_index_get_n_fields(index);
+
+ if (flag == ROW_BUILD_FOR_INSERT && dict_index_is_clust(index)) {
+ num_v = dict_table_get_n_v_cols(index->table);
+ entry = dtuple_create_with_vcol(heap, entry_len, num_v);
+ } else {
+ entry = dtuple_create(heap, entry_len);
+ }
+
+ if (dict_index_is_ibuf(index)) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ /* There may only be externally stored columns
+ in a clustered index B-tree of a user table. */
+ ut_a(!ext);
+ } else {
+ dtuple_set_n_fields_cmp(
+ entry, dict_index_get_n_unique_in_tree(index));
+ if (dict_index_is_spatial(index)) {
+ /* Set the MBR field */
+ if (!row_build_spatial_index_key(
+ index, ext,
+ dtuple_get_nth_field(entry, 0),
+ dtuple_get_nth_field(
+ row,
+ dict_index_get_nth_field(index, i)
+ ->col->ind), flag, heap)) {
+ return NULL;
+ }
+
+ i = 1;
+ }
+ }
+
+ for (; i < entry_len; i++) {
+ const dict_field_t& f = index->fields[i];
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ if (f.col->is_dropped()) {
+ ut_ad(index->is_primary());
+ ut_ad(index->is_instant());
+ ut_ad(!f.col->is_virtual());
+ dict_col_copy_type(f.col, &dfield->type);
+ if (f.col->is_nullable()) {
+ dfield_set_null(dfield);
+ } else {
+ dfield_set_data(dfield, field_ref_zero,
+ f.fixed_len);
+ }
+ continue;
+ }
+
+ const dfield_t* dfield2;
+
+ if (f.col->is_virtual()) {
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(f.col);
+
+ ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row));
+ dfield2 = dtuple_get_nth_v_field(row, v_col->v_pos);
+
+ ut_ad(dfield_is_null(dfield2) ||
+ dfield_get_len(dfield2) == 0 || dfield2->data);
+ ut_ad(!dfield_is_ext(dfield2));
+ if (UNIV_UNLIKELY(dfield2->type.mtype
+ == DATA_MISSING)) {
+ ut_ad(flag == ROW_BUILD_FOR_PURGE);
+ return(NULL);
+ }
+ } else {
+ dfield2 = dtuple_get_nth_field(row, f.col->ind);
+ if (UNIV_UNLIKELY(dfield2->type.mtype
+ == DATA_MISSING)) {
+ /* The field has not been initialized in
+ the row. This should be from
+ trx_undo_rec_get_partial_row(). */
+ return(NULL);
+ }
+
+ ut_ad(!(dfield2->type.prtype & DATA_VIRTUAL));
+ }
+
+ compile_time_assert(DATA_MISSING == 0);
+
+ *dfield = *dfield2;
+
+ if (dfield_is_null(dfield)) {
+ continue;
+ }
+
+ ut_ad(!(index->type & DICT_FTS));
+
+ ulint len = dfield_get_len(dfield);
+
+ if (f.prefix_len == 0
+ && (!dfield_is_ext(dfield)
+ || dict_index_is_clust(index))) {
+ /* The *dfield = *dfield2 above suffices for
+ columns that are stored in-page, or for
+ clustered index record columns that are not
+ part of a column prefix in the PRIMARY KEY. */
+ continue;
+ }
+
+ /* If the column is stored externally (off-page) in
+ the clustered index, it must be an ordering field in
+ the secondary index. If !atomic_blobs, the only way
+ we may have a secondary index pointing to a clustered
+ index record with an off-page column is when it is a
+ column prefix index. If atomic_blobs, also fully
+ indexed long columns may be stored off-page. */
+ ut_ad(f.col->ord_part);
+
+ if (ext && !f.col->is_virtual()) {
+ /* See if the column is stored externally. */
+ const byte* buf = row_ext_lookup(ext, f.col->ind,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ return(NULL);
+ }
+ dfield_set_data(dfield, buf, len);
+ }
+
+ if (f.prefix_len == 0) {
+ /* If ROW_FORMAT=DYNAMIC or
+ ROW_FORMAT=COMPRESSED, we can have a
+ secondary index on an entire column
+ that is stored off-page in the
+ clustered index. As this is not a
+ prefix index (prefix_len == 0),
+ include the entire off-page column in
+ the secondary index record. */
+ continue;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ /* This table is either in
+ (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT)
+ or a purge record where the ordered part of
+ the field is not external.
+ In ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT,
+ the maximum column prefix
+ index length is 767 bytes, and the clustered
+ index record contains a 768-byte prefix of
+ each off-page column. */
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ dfield_set_len(dfield, len);
+ }
+
+ /* If a column prefix index, take only the prefix. */
+ if (f.prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ f.col->prtype,
+ f.col->mbminlen, f.col->mbmaxlen,
+ f.prefix_len, len,
+ static_cast<char*>(dfield_get_data(dfield)));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ for (i = num_v; i--; ) {
+ ut_ad(index->is_primary());
+ ut_ad(flag == ROW_BUILD_FOR_INSERT);
+ dfield_t* dfield = dtuple_get_nth_v_field(entry, i);
+ const dict_v_col_t* v_col = dict_table_get_nth_v_col(
+ index->table, i);
+ ut_ad(!v_col->m_col.is_dropped());
+ ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row));
+ const dfield_t* dfield2 = dtuple_get_nth_v_field(
+ row, v_col->v_pos);
+ ut_ad(dfield_is_null(dfield2) ||
+ dfield_get_len(dfield2) == 0 || dfield2->data);
+ ut_ad(dfield2->type.mtype != DATA_MISSING);
+ *dfield = *dfield2;
+ }
+
+ return entry;
+}
+
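
The column-prefix handling in row_build_index_entry_low() above (and again in row_build_row_ref() further below) truncates an index field to f.prefix_len using dtype_get_at_most_n_mbchars(), i.e. in a character-set-aware way so that a multi-byte character is never cut in half. The following standalone sketch illustrates that idea for UTF-8 only; utf8_prefix_bytes is an invented helper, not the InnoDB function, which additionally takes mbminlen/mbmaxlen and the precise type into account.

#include <cassert>
#include <cstddef>
#include <string>

// Number of bytes covering at most n_chars UTF-8 characters of s,
// never splitting a multi-byte sequence. Assumes s is well-formed
// UTF-8 starting at a character boundary (sketch only).
static std::size_t utf8_prefix_bytes(const std::string& s, std::size_t n_chars)
{
	std::size_t bytes = 0, chars = 0;
	while (bytes < s.size() && chars < n_chars) {
		const unsigned char c = static_cast<unsigned char>(s[bytes]);
		const std::size_t seq = c < 0x80 ? 1 : c < 0xE0 ? 2
			: c < 0xF0 ? 3 : 4;
		if (bytes + seq > s.size()) {
			break;	/* incomplete trailing sequence */
		}
		bytes += seq;
		++chars;
	}
	return bytes;
}

int main()
{
	/* "a" followed by U+00DF is 1 + 2 bytes in UTF-8: a two-character
	prefix keeps all 3 bytes, a one-character prefix keeps 1 byte. */
	const std::string s = "a\xC3\x9F";
	assert(utf8_prefix_bytes(s, 2) == 3);
	assert(utf8_prefix_bytes(s, 1) == 1);
	return 0;
}
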
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, taking into account virtual columns that
+are being added along with new indexes.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added/changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built; */
+static inline
+dtuple_t*
+row_build_low(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap)
+{
+ const byte* copy;
+ dtuple_t* row;
+ ulint n_ext_cols;
+ ulint* ext_cols = NULL; /* remove warning */
+ ulint len;
+ byte* buf;
+ ulint j;
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(index != NULL);
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!col_map || col_table);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* Some blob refs can be NULL during crash recovery before
+ trx_rollback_active() has completed execution, or when a concurrently
+ executing insert or update has committed the B-tree mini-transaction
+ but has not yet managed to restore the cursor position for writing
+ the big_rec. Note that the mini-transaction can be committed multiple
+ times, and the cursor restore can happen multiple times for single
+ insert or update statement. */
+ ut_a(!rec_offs_any_null_extern(rec, offsets)
+ || trx_sys.is_registered(current_trx(),
+ row_get_rec_trx_id(rec, index,
+ offsets)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ copy = rec_copy(buf, rec, offsets);
+ } else {
+ copy = rec;
+ }
+
+ n_ext_cols = rec_offs_n_extern(offsets);
+ if (n_ext_cols) {
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols));
+ }
+
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(copy, index, true, const_cast<rec_offs*>(offsets));
+
+ if (!col_table) {
+ ut_ad(!col_map);
+ ut_ad(!defaults);
+ col_table = index->table;
+ }
+
+ if (defaults) {
+ ut_ad(col_map);
+ row = dtuple_copy(defaults, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(col_table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else if (add_v != NULL) {
+ row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(col_table),
+ dict_table_get_n_v_cols(col_table) + add_v->n_v_col);
+ dict_table_copy_types(row, col_table);
+
+ for (ulint i = 0; i < add_v->n_v_col; i++) {
+ dict_col_copy_type(
+ &add_v->v_col[i].m_col,
+ dfield_get_type(dtuple_get_nth_v_field(
+ row, i + col_table->n_v_def)));
+ }
+ } else {
+ row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(col_table),
+ dict_table_get_n_v_cols(col_table));
+ dict_table_copy_types(row, col_table);
+ }
+
+ dtuple_set_info_bits(row, rec_get_info_bits(
+ copy, rec_offs_comp(offsets)));
+
+ j = 0;
+
+ const dict_field_t* ind_field = index->fields;
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (i == index->first_user_field()
+ && rec_is_alter_metadata(rec, *index)) {
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ ut_d(ulint len);
+ ut_d(rec_get_nth_field_offs(offsets, i, &len));
+ ut_ad(len == FIELD_REF_SIZE);
+ continue;
+ }
+
+ if (UNIV_UNLIKELY(ind_field
+ >= &index->fields[index->n_fields])) {
+ ut_ad(rec_is_metadata(rec, *index));
+ continue;
+ }
+
+ const dict_col_t* col = dict_field_get_col(ind_field);
+
+ if ((ind_field++)->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ ulint col_no = dict_col_get_no(col);
+
+ if (col_map) {
+ col_no = col_map[col_no];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* dropped column */
+ continue;
+ }
+ }
+
+ dfield_t* dfield = dtuple_get_nth_field(row, col_no);
+
+ const void* field = rec_get_nth_field(
+ copy, offsets, i, &len);
+ if (len == UNIV_SQL_DEFAULT) {
+ field = index->instant_field_value(i, &len);
+ if (field && type != ROW_COPY_POINTERS) {
+ field = mem_heap_dup(heap, field, len);
+ }
+ }
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+
+ col = dict_table_get_nth_col(col_table, col_no);
+
+ if (col->ord_part) {
+ /* We will have to fetch prefixes of
+ externally stored columns that are
+ referenced by column prefixes. */
+ ext_cols[j++] = col_no;
+ }
+ }
+ }
+
+ rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets));
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (!ext) {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed.
+
+ During online table rebuild,
+ row_log_table_apply_delete_low()
+ may use a cache that was set up by
+ row_log_table_delete(). */
+
+ } else if (j) {
+ *ext = row_ext_create(j, ext_cols, *index->table, row,
+ heap);
+ } else {
+ *ext = NULL;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
+
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead */
+ const dtuple_t* defaults,
+ /*!< in: default values of
+ added and changed columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ return(row_build_low(type, index, rec, offsets, col_table,
+ defaults, NULL, col_map, ext, heap));
+}
+
+/** An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, taking into account virtual columns that
+are being added along with new indexes.
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA;
+@param[in] index clustered index
+@param[in] rec record in the clustered index
+@param[in] offsets rec_get_offsets(rec,index) or NULL
+@param[in] col_table table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead
+@param[in] defaults default values of added, changed columns, or NULL
+@param[in] add_v new virtual columns added
+ along with new indexes
+@param[in] col_map mapping of old column
+ numbers to new ones, or NULL
+@param[in] ext cache of externally stored column
+ prefixes, or NULL
+@param[in] heap memory heap from which
+ the memory needed is allocated
+@return own: row built; */
+dtuple_t*
+row_build_w_add_vcol(
+ ulint type,
+ const dict_index_t* index,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ const dict_table_t* col_table,
+ const dtuple_t* defaults,
+ const dict_add_v_col_t* add_v,
+ const ulint* col_map,
+ row_ext_t** ext,
+ mem_heap_t* heap)
+{
+ return(row_build_low(type, index, rec, offsets, col_table,
+ defaults, add_v, col_map, ext, heap));
+}
+
+/** Convert an index record to a data tuple.
+@tparam metadata whether the index->instant_field_value() needs to be accessed
+@tparam mblob 1 if rec_is_alter_metadata();
+2 if we want converted metadata corresponding to info_bits
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[out] n_ext number of externally stored columns
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits (only used if mblob=2)
+@param[in] pad (only used if mblob=2)
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+template<bool metadata, int mblob = 0>
+static inline
+dtuple_t*
+row_rec_to_index_entry_impl(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits = 0,
+ bool pad = false)
+{
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(index != NULL);
+ ut_ad(!mblob || index->is_primary());
+ ut_ad(!mblob || !index->table->is_temporary());
+ ut_ad(!mblob || !dict_index_is_spatial(index));
+ compile_time_assert(!mblob || metadata);
+ compile_time_assert(mblob <= 2);
+ /* Because this function may be invoked by row0merge.cc
+ on a record whose header is in different format, the check
+ rec_offs_validate(rec, index, offsets) must be avoided here. */
+
+ const bool got = mblob == 2 && rec_is_alter_metadata(rec, *index);
+ ulint rec_len = rec_offs_n_fields(offsets);
+ if (mblob == 2) {
+ ut_ad(info_bits == REC_INFO_METADATA_ALTER
+ || info_bits == REC_INFO_METADATA_ADD);
+ if (pad) {
+ ut_ad(rec_len <= ulint(index->n_fields + got));
+ rec_len = ulint(index->n_fields)
+ + (info_bits == REC_INFO_METADATA_ALTER);
+ } else if (got) {
+ rec_len = std::min(rec_len,
+ ulint(index->n_fields + got));
+ } else if (info_bits == REC_INFO_METADATA_ALTER) {
+ ut_ad(rec_len <= index->n_fields);
+ rec_len++;
+ }
+ } else {
+ ut_ad(info_bits == 0);
+ ut_ad(!pad);
+ }
+ dtuple_t* entry = dtuple_create(heap, rec_len);
+ dfield_t* dfield = entry->fields;
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(mblob == 2
+ || rec_len == dict_index_get_n_fields(index) + uint(mblob == 1)
+ /* a record for older SYS_INDEXES table
+ (missing merge_threshold column) is acceptable. */
+ || (!index->table->is_temporary()
+ && index->table->id == DICT_INDEXES_ID
+ && rec_len + 1 == dict_index_get_n_fields(index)));
+
+ ulint i;
+ for (i = 0; i < (mblob ? index->first_user_field() : rec_len);
+ i++, dfield++) {
+ dict_col_copy_type(dict_index_get_nth_col(index, i),
+ &dfield->type);
+ if (!mblob
+ && dict_index_is_spatial(index)
+ && DATA_GEOMETRY_MTYPE(dfield->type.mtype)) {
+ dfield->type.prtype |= DATA_GIS_MBR;
+ }
+
+ ulint len;
+ const byte* field = metadata
+ ? rec_get_nth_cfield(rec, index, offsets, i, &len)
+ : rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+
+ if (mblob) {
+ ulint len;
+ const byte* field;
+ ulint j = i;
+
+ if (mblob == 2) {
+ const bool want = info_bits == REC_INFO_METADATA_ALTER;
+ if (got == want) {
+ if (got) {
+ goto copy_metadata;
+ }
+ } else {
+ if (want) {
+ /* Allocate a placeholder for
+ adding metadata in an update. */
+ len = FIELD_REF_SIZE;
+ field = static_cast<byte*>(
+ mem_heap_zalloc(heap, len));
+ /* In reality there is one fewer
+ field present in the record. */
+ rec_len--;
+ goto init_metadata;
+ }
+
+ /* Skip the undesired metadata blob
+ (for example, when rolling back an
+ instant ALTER TABLE). */
+ i++;
+ }
+ goto copy_user_fields;
+ }
+copy_metadata:
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i++, &len);
+init_metadata:
+ dfield->type.metadata_blob_init();
+ ut_ad(len == FIELD_REF_SIZE);
+ dfield_set_data(dfield, field, len);
+ dfield_set_ext(dfield++);
+copy_user_fields:
+ for (; i < rec_len; i++, dfield++) {
+ dict_col_copy_type(dict_index_get_nth_col(index, j++),
+ &dfield->type);
+ if (mblob == 2 && pad
+ && i >= rec_offs_n_fields(offsets)) {
+ field = index->instant_field_value(j - 1,
+ &len);
+ dfield_set_data(dfield, field, len);
+ continue;
+ }
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+ }
+
+ if (mblob == 2) {
+ ulint n_fields = ulint(dfield - entry->fields);
+ ut_ad(entry->n_fields >= n_fields);
+ entry->n_fields = n_fields;
+ }
+ ut_ad(dfield == entry->fields + entry->n_fields);
+ ut_ad(dtuple_check_typed(entry));
+ return entry;
+}
+
+/** Convert an index record to a data tuple.
+@param[in] rec index record
+@param[in] index index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in,out] heap memory heap for allocations */
+dtuple_t*
+row_rec_to_index_entry_low(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap)
+{
+ return row_rec_to_index_entry_impl<false>(rec, index, offsets, heap);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(index != NULL);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* Take a copy of rec to heap */
+ const rec_t* copy_rec = rec_copy(
+ static_cast<byte*>(mem_heap_alloc(heap,
+ rec_offs_size(offsets))),
+ rec, offsets);
+
+ rec_offs_make_valid(copy_rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_t* entry = rec_is_alter_metadata(copy_rec, *index)
+ ? row_rec_to_index_entry_impl<true,1>(
+ copy_rec, index, offsets, heap)
+ : row_rec_to_index_entry_impl<true>(
+ copy_rec, index, offsets, heap);
+
+ rec_offs_make_valid(rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ return(entry);
+}
+
+/** Convert a metadata record to a data tuple.
+@param[in] rec metadata record
+@param[in] index clustered index after instant ALTER TABLE
+@param[in] offsets rec_get_offsets(rec)
+@param[in,out] heap memory heap for allocations
+@param[in] info_bits the info_bits after an update
+@param[in] pad whether to pad to index->n_fields */
+dtuple_t*
+row_metadata_to_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ mem_heap_t* heap,
+ ulint info_bits,
+ bool pad)
+{
+ ut_ad(info_bits == REC_INFO_METADATA_ALTER
+ || info_bits == REC_INFO_METADATA_ADD);
+ ut_ad(rec_is_metadata(rec, *index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ const rec_t* copy_rec = rec_copy(
+ static_cast<byte*>(mem_heap_alloc(heap,
+ rec_offs_size(offsets))),
+ rec, offsets);
+
+ rec_offs_make_valid(copy_rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_t* entry = info_bits == REC_INFO_METADATA_ALTER
+ || rec_is_alter_metadata(copy_rec, *index)
+ ? row_rec_to_index_entry_impl<true,2>(
+ copy_rec, index, offsets, heap, info_bits, pad)
+ : row_rec_to_index_entry_impl<true>(
+ copy_rec, index, offsets, heap);
+
+ rec_offs_make_valid(rec, index, true,
+ const_cast<rec_offs*>(offsets));
+
+ dtuple_set_info_bits(entry, info_bits);
+ return entry;
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index != NULL);
+ ut_ad(rec != NULL);
+ ut_ad(heap != NULL);
+ ut_ad(!dict_index_is_clust(index));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ rec = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ ut_ad(!rec_offs_nth_default(offsets, pos));
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ rec_offs* offsets)/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+{
+ const dict_index_t* clust_index;
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_a(index->table);
+
+ clust_index = dict_table_get_first_index(index->table);
+ ut_ad(clust_index);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ ut_ad(!rec_offs_nth_default(offsets, pos));
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row reference.
+@return TRUE if found */
+bool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ut_ad(dtuple_check_typed(ref));
+
+ dict_index_t *index = dict_table_get_first_index(table);
+ btr_pcur_init(pcur);
+ pcur->btr_cur.page_cur.index = index;
+
+ if (UNIV_UNLIKELY(ref->info_bits != 0)) {
+ ut_ad(ref->is_metadata());
+ ut_ad(ref->n_fields <= index->n_uniq);
+ if (pcur->open_leaf(true, index, mode, mtr) != DB_SUCCESS
+ || !btr_pcur_move_to_next_user_rec(pcur, mtr)) {
+ return false;
+ }
+ /* We do not necessarily have index->is_instant() here,
+ because we could be executing a rollback of an
+ instant ADD COLUMN operation. The function
+ rec_is_metadata() asserts index->is_instant();
+ we do not want to call it here. */
+ return rec_get_info_bits(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table))
+ & REC_INFO_MIN_REC_FLAG;
+ } else {
+ ut_a(ref->n_fields == index->n_uniq);
+ if (btr_pcur_open(ref, PAGE_CUR_LE, mode, pcur, mtr)
+ != DB_SUCCESS) {
+ return false;
+ }
+ }
+
+ return !page_rec_is_infimum(btr_pcur_get_rec(pcur))
+ && btr_pcur_get_low_match(pcur) == dtuple_get_n_fields(ref);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ auto found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ mem_heap_free(heap);
+
+ *clust_index = dict_table_get_first_index(table);
+ return found ? btr_pcur_get_rec(&pcur) : nullptr;
+}
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ const dtuple_t* entry, /*!< in: index entry */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ if (btr_pcur_open(entry, PAGE_CUR_LE, mode, pcur, mtr) != DB_SUCCESS) {
+ return ROW_NOT_FOUND;
+ }
+
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ ut_ad(!(~mode & BTR_DELETE));
+ return(ROW_NOT_DELETED_REF);
+
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ return(ROW_BUFFERED);
+
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(ROW_NOT_FOUND);
+ } else if (low_match != n_fields) {
+
+ return(ROW_NOT_FOUND);
+ }
+
+ return(ROW_FOUND);
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint ret;
+
+ if (data_len <= sizeof(ib_uint64_t)) {
+
+ ib_uint64_t value;
+ ibool unsigned_type = prtype & DATA_UNSIGNED;
+
+ value = mach_read_int_type(
+ (const byte*) data, data_len, unsigned_type);
+
+ ret = (ulint) snprintf(
+ buf, buf_size,
+ unsigned_type ? "%llu" : "%lld", (longlong) value)+1;
+ } else {
+
+ *format_in_hex = TRUE;
+ ret = 0;
+ }
+
+ return(ut_min(ret, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint charset_coll;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ /* we assume system_charset_info is UTF-8 */
+
+ charset_coll = dtype_get_charset_coll(prtype);
+
+ if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+ return(ut_str_sql_format(data, data_len, buf, buf_size));
+ }
+ /* else */
+
+ if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) {
+
+ *format_in_hex = TRUE;
+ return(0);
+ }
+ /* else */
+
+ return(innobase_raw_format(data, data_len, charset_coll,
+ buf, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint ret;
+ ibool format_in_hex;
+
+ ut_ad(data_len != UNIV_SQL_DEFAULT);
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ if (data_len == UNIV_SQL_NULL) {
+
+ ret = snprintf((char*) buf, buf_size, "NULL") + 1;
+
+ return(ut_min(ret, buf_size));
+ }
+
+ mtype = dict_field->col->mtype;
+ prtype = dict_field->col->prtype;
+
+ format_in_hex = FALSE;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ret = row_raw_format_int(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+ break;
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+
+ ret = row_raw_format_str(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
+ break;
+ /* XXX support more data types */
+ default:
+ format_in_hex:
+
+ if (UNIV_LIKELY(buf_size > 2)) {
+
+ memcpy(buf, "0x", 2);
+ buf += 2;
+ buf_size -= 2;
+ ret = 2 + ut_raw_to_hex(data, data_len,
+ buf, buf_size);
+ } else {
+
+ buf[0] = '\0';
+ ret = 1;
+ }
+ }
+
+ return(ret);
+}
+
+#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+
+#ifdef HAVE_UT_CHRONO_T
+
+void
+test_row_raw_format_int()
+{
+ ulint ret;
+ char buf[128];
+ ibool format_in_hex;
+ ulint i;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+ ret_expected, buf_expected, format_in_hex_expected)\
+ do {\
+ ibool ok = TRUE;\
+ ulint i;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ format_in_hex = FALSE;\
+ fprintf(stderr, "TESTING \"\\x");\
+ for (i = 0; i < data_len; i++) {\
+ fprintf(stderr, "%02hhX", data[i]);\
+ }\
+ fprintf(stderr, "\", %lu, %lu, %lu\n",\
+ (ulint) data_len, (ulint) prtype,\
+ (ulint) buf_size);\
+ ret = row_raw_format_int(data, data_len, prtype,\
+ buf, buf_size, &format_in_hex);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (format_in_hex != format_in_hex_expected) {\
+ fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+ (int) format_in_hex_expected,\
+ (int) format_in_hex);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+ (ulint) ret, buf, (int) format_in_hex);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+#if 1
+ /* min values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, 0,
+ buf, sizeof(buf), 5, "-128", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, 0,
+ buf, sizeof(buf), 7, "-32768", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, 0,
+ buf, sizeof(buf), 9, "-8388608", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+ buf, sizeof(buf), 12, "-2147483648", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+ buf, sizeof(buf), 14, "-549755813888", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+ buf, sizeof(buf), 17, "-140737488355328", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+ buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+ buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+ /* min values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ /* max values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, 0,
+ buf, sizeof(buf), 4, "127", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, 0,
+ buf, sizeof(buf), 6, "32767", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+ buf, sizeof(buf), 8, "8388607", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+ buf, sizeof(buf), 11, "2147483647", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+ buf, sizeof(buf), 13, "549755813887", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+ buf, sizeof(buf), 16, "140737488355327", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+ buf, sizeof(buf), 18, "36028797018963967", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+ buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+ /* max values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 4, "255", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "65535", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 9, "16777215", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 11, "4294967295", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 14, "1099511627775", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 16, "281474976710655", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 18, "72057594037927935", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+ /* some random values */
+
+ CALL_AND_TEST("\x52", 1, 0,
+ buf, sizeof(buf), 4, "-46", 0);
+
+ CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "14", 0);
+
+ CALL_AND_TEST("\x62\xCE", 2, 0,
+ buf, sizeof(buf), 6, "-7474", 0);
+
+ CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "10710", 0);
+
+ CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+ buf, sizeof(buf), 5, "-112", 0);
+
+ CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "41238", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+ buf, sizeof(buf), 3, "-9", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "92", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+ buf, sizeof(buf), 6, "-9117", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "91234", 0);
+#endif
+
+ /* speed test */
+
+ ut_chrono_t ch(__func__);
+
+ for (i = 0; i < 1000000; i++) {
+ row_raw_format_int("\x23", 1,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x23", 1,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+ }
+}
+
+#endif /* HAVE_UT_CHRONO_T */
+
+#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */
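
row_raw_format() above, and the row_raw_format_int() unit tests that follow it, rely on one convention worth spelling out: the return value counts the bytes written to "buf" including the terminating '\0', clamped to buf_size, and data that cannot be rendered as an integer or string falls back to a "0x..." hexadecimal form. The sketch below reproduces that convention in isolation; format_or_hex is an invented name, not the InnoDB API, and it ignores character-set handling entirely.

#include <cassert>
#include <cstdio>
#include <cstring>

// Return the number of bytes written to buf including the trailing '\0',
// never more than buf_size; fall back to "0x..." for non-printable data.
static std::size_t format_or_hex(const unsigned char* data, std::size_t data_len,
				 bool printable, char* buf, std::size_t buf_size)
{
	if (buf_size == 0) {
		return 0;
	}
	if (printable) {
		std::size_t ret = (std::size_t) std::snprintf(
			buf, buf_size, "%.*s",
			(int) data_len, (const char*) data) + 1;
		return ret < buf_size ? ret : buf_size;
	}
	/* Hex fallback, analogous to the "0x" branch of row_raw_format(). */
	std::size_t ret = (std::size_t) std::snprintf(buf, buf_size, "0x");
	for (std::size_t i = 0; i < data_len && ret + 3 <= buf_size; i++) {
		ret += (std::size_t) std::snprintf(buf + ret, buf_size - ret,
						   "%02X", data[i]);
	}
	return ret + 1 < buf_size ? ret + 1 : buf_size;
}

int main()
{
	char buf[16];
	/* 3 data bytes -> "abc" plus '\0' -> 4 bytes reported. */
	assert(format_or_hex((const unsigned char*) "abc", 3, true,
			     buf, sizeof buf) == 4);
	assert(std::strcmp(buf, "abc") == 0);

	const unsigned char bin[2] = { 0xAB, 0xCD };
	format_or_hex(bin, 2, false, buf, sizeof buf);
	assert(std::strcmp(buf, "0xABCD") == 0);
	return 0;
}
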
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
new file mode 100644
index 00000000..6c76dd91
--- /dev/null
+++ b/storage/innobase/row/row0sel.cc
@@ -0,0 +1,6947 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.cc
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "gis0rtree.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "buf0lru.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+#include "sql_error.h"
+#ifdef WITH_WSREP
+#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */
+#endif
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/********************************************************************//**
+Returns TRUE if the user-defined column in a secondary index record
+is alphabetically the same as the corresponding BLOB column in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return whether the columns are equal */
+static
+bool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of
+ a character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of
+ a character, in bytes */
+ const byte* clust_field, /*!< in: the locally stored part of
+ the clustered index column, including
+ the BLOB pointer; the clustered
+ index record must be covered by
+ a lock or a page latch to protect it
+ against deletion (rollback or purge) */
+ ulint clust_len, /*!< in: length of clust_field */
+ const byte* sec_field, /*!< in: column in secondary index */
+ ulint sec_len, /*!< in: length of sec_field */
+ ulint prefix_len, /*!< in: index column prefix length
+ in bytes, or 0 for full column */
+ dict_table_t* table) /*!< in: table */
+{
+ ulint len;
+ byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN + 1];
+
+ /* This function should never be invoked on tables in
+ ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they
+ should always contain enough prefix in the clustered index record. */
+ ut_ad(dict_table_has_atomic_blobs(table));
+ ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(!prefix_len || prefix_len >= sec_len);
+ ut_a(prefix_len <= sizeof buf);
+
+ if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return false;
+ }
+
+ len = btr_copy_externally_stored_field_prefix(
+ buf, prefix_len ? prefix_len : sizeof buf,
+ table->space->zip_size(),
+ clust_field, clust_len);
+
+ if (len == 0) {
+ /* The BLOB was being deleted as the server crashed.
+ There should not be any secondary index records
+ referring to this clustered index record, because
+ btr_free_externally_stored_field() is called after all
+ secondary index entries of the row have been purged. */
+ return false;
+ }
+
+ if (prefix_len) {
+ len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
+ prefix_len, len,
+ reinterpret_cast<const char*>
+ (buf));
+ } else if (len >= sizeof buf) {
+ ut_ad("too long column" == 0);
+ return false;
+ }
+
+ return !cmp_data(mtype, prtype, false, buf, len, sec_field, sec_len);
+}
+
+/** Function to read the secondary spatial index, calculate
+the minimum bounding rectangle for clustered index record
+and secondary index record and compare it.
+@param sec_rec secondary index record
+@param sec_index spatial secondary index
+@param clust_rec clustered index record
+@param clust_index clustered index
+@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the
+ corresponding fields in the clustered record, when compared with
+ collation;
+@retval DB_SUCCESS if not equal */
+static
+dberr_t
+row_sel_spatial_sec_rec_is_for_clust_rec(
+ const rec_t *sec_rec, const dict_index_t *sec_index,
+ const rec_t *clust_rec, dict_index_t *clust_index)
+{
+ mem_heap_t *heap= mem_heap_create(256);
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *clust_offs= clust_offsets_;
+ ulint clust_len;
+
+ rec_offs_init(clust_offsets_);
+ ulint clust_pos= dict_col_get_clust_pos(
+ dict_index_get_nth_col(sec_index, 0), clust_index);
+ clust_offs= rec_get_offsets(clust_rec, clust_index, clust_offs,
+ clust_index->n_core_fields, clust_pos + 1,
+ &heap);
+ ut_ad(sec_index->n_user_defined_cols == 1);
+ const byte *clust_field= rec_get_nth_field(clust_rec, clust_offs,
+ clust_pos, &clust_len);
+ if (clust_len == UNIV_SQL_NULL || clust_len < GEO_DATA_HEADER_SIZE)
+ {
+ ut_ad("corrupted geometry column" == 0);
+err_exit:
+ mem_heap_free(heap);
+ return DB_SUCCESS;
+ }
+
+ /* For externally stored field, we need to get full
+ geo data to generate the MBR for comparing. */
+ if (rec_offs_nth_extern(clust_offs, clust_pos))
+ {
+ clust_field= btr_copy_externally_stored_field(
+ &clust_len, clust_field, sec_index->table->space->zip_size(),
+ clust_len, heap);
+ if (clust_field == NULL)
+ {
+ ut_ad("corrupted geometry blob" == 0);
+ goto err_exit;
+ }
+ }
+
+ ut_ad(clust_len >= GEO_DATA_HEADER_SIZE);
+ rtr_mbr_t tmp_mbr;
+ rtr_mbr_t sec_mbr;
+
+ rtree_mbr_from_wkb(
+ clust_field + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(clust_len - GEO_DATA_HEADER_SIZE),
+ SPDIMS, reinterpret_cast<double*>(&tmp_mbr));
+
+ rtr_read_mbr(sec_rec, &sec_mbr);
+
+ mem_heap_free(heap);
+ return MBR_EQUAL_CMP(&sec_mbr, &tmp_mbr)
+ ? DB_SUCCESS_LOCKED_REC
+ : DB_SUCCESS;
+}
+
+/** Returns TRUE if the user-defined column values in a secondary index record
+are alphabetically the same as the corresponding columns in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@param[in] sec_rec secondary index record
+@param[in] sec_index secondary index
+@param[in] clust_rec clustered index record;
+ must be protected by a page s-latch
+@param[in] clust_index clustered index
+@param[in] thr query thread
+@retval DB_COMPUTE_VALUE_FAILED in case of virtual column value computation
+ failure.
+@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the
+ corresponding fields in the clustered record, when compared with
+ collation;
+@retval DB_SUCCESS if not equal or if the clustered record has been marked
+ for deletion */
+static
+dberr_t
+row_sel_sec_rec_is_for_clust_rec(
+ const rec_t* sec_rec,
+ dict_index_t* sec_index,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ que_thr_t* thr)
+{
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(clust_index->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec, clust_index));
+
+ /* The clustered index record is delete-marked;
+ it is not visible in the read view. Besides,
+ if there are any externally stored columns,
+ some of them may have already been purged. */
+ return DB_SUCCESS;
+ }
+
+ if (dict_index_is_spatial(sec_index)) {
+ return row_sel_spatial_sec_rec_is_for_clust_rec(
+ sec_rec, sec_index, clust_rec,
+ clust_index);
+ }
+
+ const byte* sec_field;
+ ulint sec_len;
+ const byte* clust_field;
+ ulint n;
+ ulint i;
+ mem_heap_t* heap = mem_heap_create(256);
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs sec_offsets_[REC_OFFS_SMALL_SIZE];
+ rec_offs* clust_offs = clust_offsets_;
+ rec_offs* sec_offs = sec_offsets_;
+
+ rec_offs_init(clust_offsets_);
+ rec_offs_init(sec_offsets_);
+
+ ib_vcol_row vc(heap);
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ sec_index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos = 0;
+ ulint clust_len = 0;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ const bool is_virtual = col->is_virtual();
+
+ /* For virtual column, its value will need to be
+ reconstructed from base column in cluster index */
+ if (is_virtual) {
+ const dict_v_col_t* v_col;
+ dfield_t* vfield;
+ row_ext_t* ext;
+
+ byte *record = vc.record(thr_get_trx(thr)->mysql_thd,
+ clust_index,
+ &thr->prebuilt->m_mysql_table);
+
+ v_col = reinterpret_cast<const dict_v_col_t*>(col);
+
+ dtuple_t* row = row_build(
+ ROW_COPY_POINTERS,
+ clust_index, clust_rec,
+ clust_offs,
+ NULL, NULL, NULL, &ext, heap);
+
+ vfield = innobase_get_computed_value(
+ row, v_col, clust_index,
+ &heap, NULL, NULL,
+ thr_get_trx(thr)->mysql_thd,
+ thr->prebuilt->m_mysql_table,
+ record, NULL, NULL,
+ true);
+
+ if (vfield == NULL) {
+ innobase_report_computed_value_failed(row);
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+ len = clust_len = vfield->len;
+ clust_field = static_cast<byte*>(vfield->data);
+ } else {
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ clust_field = rec_get_nth_cfield(
+ clust_rec, clust_index, clust_offs,
+ clust_pos, &clust_len);
+ if (clust_len == UNIV_SQL_NULL) {
+ if (sec_len == UNIV_SQL_NULL) {
+ continue;
+ }
+ return DB_SUCCESS;
+ }
+ if (sec_len == UNIV_SQL_NULL) {
+ return DB_SUCCESS;
+ }
+
+ len = clust_len;
+ ulint prefix_len = ifield->prefix_len;
+ if (rec_offs_nth_extern(clust_offs, clust_pos)) {
+ /* The locally stored part of the BLOB is a prefix of the column. */
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ if (!len) {
+ goto compare_blobs;
+ }
+ }
+
+ if (prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen,
+ col->mbmaxlen, prefix_len, len,
+ reinterpret_cast<const char*>(
+ clust_field));
+ if (len < sec_len) {
+ goto check_for_blob;
+ }
+ } else {
+check_for_blob:
+ if (rec_offs_nth_extern(clust_offs,
+ clust_pos)) {
+compare_blobs:
+ if (!row_sel_sec_rec_is_for_blob(
+ col->mtype, col->prtype,
+ col->mbminlen,
+ col->mbmaxlen,
+ clust_field, clust_len,
+ sec_field, sec_len,
+ prefix_len,
+ clust_index->table)) {
+ return DB_SUCCESS;
+ }
+
+ continue;
+ }
+ }
+ }
+
+ if (cmp_data(col->mtype, col->prtype, false,
+ clust_field, len, sec_field, sec_len)) {
+ return DB_SUCCESS;
+ }
+ }
+
+ return DB_SUCCESS_LOCKED_REC;
+}
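When a secondary index stores only a column prefix, the code above truncates
the clustered index value to at most prefix_len characters via
dtype_get_at_most_n_mbchars() before comparing. The helper below is a
hypothetical, UTF-8-only illustration of that character-counting truncation;
the real function is charset-aware and handles many encodings.

#include <cstddef>
#include <string>

/* Length in bytes of at most n_chars leading characters of a UTF-8 string.
   UTF-8 continuation bytes have the bit pattern 10xxxxxx. */
static size_t utf8_prefix_bytes(const std::string& s, size_t n_chars)
{
	size_t chars = 0;
	size_t i = 0;
	while (i < s.size()) {
		const bool is_lead =
			(static_cast<unsigned char>(s[i]) & 0xC0) != 0x80;
		if (is_lead) {
			if (chars == n_chars) {
				break;	/* next character would exceed the prefix */
			}
			++chars;
		}
		++i;
	}
	return i;
}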
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = static_cast<sel_node_t*>(
+ mem_heap_alloc(heap, sizeof(sel_node_t)));
+
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /*!< in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /*!< in: first variable in a list of
+ variables */
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ for (exp = node->select_list;
+ var != 0;
+ var = static_cast<sym_node_t*>(que_node_get_next(var))) {
+
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ for (func_node = static_cast<func_node_t*>(node->select_list);
+ func_node != 0;
+ func_node = static_cast<func_node_t*>(
+ que_node_get_next(func_node))) {
+
+ eval_node_set_int_val(func_node, 0);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*********************************************************************//**
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /*!< in: record index */
+ const rec_t* rec, /*!< in: record in a clustered or non-clustered
+ index; must be protected by a page latch */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ mem_heap_t* heap = NULL;
+ ibool needs_copy;
+
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(
+ offsets, field_no) != 0)) {
+
+ /* Copy an externally stored field to the
+ temporary heap, if possible. */
+
+ heap = mem_heap_create(1);
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ index->table->space->zip_size(),
+ field_no, &len, heap);
+
+ /* data == NULL means that the
+ externally stored field was not
+ written yet. This record
+ should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED
+ transactions. The InnoDB SQL parser
+ (the sole caller of this function)
+ does not implement READ UNCOMMITTED,
+ and it is not involved during rollback. */
+ ut_a(data);
+ ut_a(len != UNIV_SQL_NULL);
+
+ needs_copy = TRUE;
+ } else {
+ data = rec_get_nth_cfield(rec, index, offsets,
+ field_no, &len);
+ needs_copy = column->copy_val;
+ }
+
+ if (needs_copy) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
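The needs_copy decision above boils down to this: externally stored fields are
always materialized into a private heap, while ordinary fields are copied only
when the column node requests it (copy_val); otherwise the value simply
aliases bytes on the latched page. A schematic, hypothetical restatement using
standard containers rather than InnoDB heaps:

#include <cstddef>
#include <vector>

/* Either copy the bytes into owned storage or alias them in place. */
struct field_value {
	std::vector<unsigned char>	owned;	/* used when copying */
	const unsigned char*		ptr;	/* owned.data() or a page pointer */
	size_t				len;
};

static void set_field(field_value& v, const unsigned char* data, size_t len,
		      bool needs_copy)
{
	v.len = len;
	if (needs_copy) {
		v.owned.assign(data, data + len);
		v.ptr = v.owned.data();
	} else {
		v.owned.clear();
		v.ptr = data;	/* valid only while the page latch is held */
	}
}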
+
+/*********************************************************************//**
+Allocates a prefetch buffer for a column when prefetch is first time done. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /*!< in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = static_cast<sel_buf_t*>(
+ ut_malloc_nokey(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+ sel_buf->len = 0;
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ ut_free(sel_buf->data);
+ }
+ }
+
+ ut_free(prefetch_buf);
+}
+
+/*********************************************************************//**
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes. */
+static
+void
+sel_dequeue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+ ut_d(dfield_set_null(val));
+
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+ ut_ad(!dfield_is_ext(val));
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = static_cast<byte*>(dfield_get_data(val));
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_enqueue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ for (column = UT_LIST_GET_FIRST(plan->columns);
+ column != 0;
+ column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+ if (!column->copy_val) {
+ /* There is no sense in pushing pointers to database
+ page fields when we do not keep a latch on the page! */
+ continue;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = static_cast<byte*>(dfield_get_data(val));
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+ }
+}
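Both the enqueue above and the dequeue before it rely on the same trick:
instead of copying, the prefetch slot and the column's value field swap their
data pointer, length and buffer size, so every heap-allocated buffer stays
owned by exactly one of the two and can be freed later. A tiny, hypothetical
model of that ownership swap:

#include <cstddef>
#include <utility>

struct buf_slot {
	unsigned char*	data;		/* heap-allocated buffer, or nullptr */
	size_t		len;		/* length of the current value */
	size_t		buf_size;	/* allocated size, 0 if not owned */
};

/* Exchange ownership between a prefetch slot and a column value. */
static void swap_ownership(buf_slot& slot, buf_slot& val)
{
	std::swap(slot.data, val.data);
	std::swap(slot.len, val.len);
	std::swap(slot.buf_size, val.buf_size);
}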
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_sel_build_prev_vers(
+/*====================*/
+ ReadView* read_view, /*!< in: read view */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, plan->index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+
+ if (*old_vers_heap) {
+ mem_heap_empty(*old_vers_heap);
+ } else {
+ *old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, index, offsets, read_view, offset_heap,
+ *old_vers_heap, old_vers, NULL);
+ return(err);
+}
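The heap handling above follows a reuse-or-create pattern: an existing
per-plan heap is emptied and reused across calls, and it is allocated only on
first use. A generic sketch of the same pattern, with mem_heap_t replaced by a
hypothetical arena type:

#include <memory>
#include <vector>

/* A stand-in arena that can be emptied and reused. */
struct arena {
	std::vector<unsigned char> storage;
	void empty() { storage.clear(); }
};

/* Reuse the arena if it already exists, otherwise create it. */
static arena& reuse_or_create(std::unique_ptr<arena>& slot)
{
	if (slot) {
		slot->empty();
	} else {
		slot = std::make_unique<arena>();
	}
	return *slot;
}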
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read. */
+static
+void
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ const rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ dtuple_t** vrow, /*!< out: to be filled with old virtual
+ column version if any */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(
+ rec_offs_size(*offsets));
+ }
+
+ row_vers_build_for_semi_consistent_read(prebuilt->trx,
+ rec, mtr, clust_index, offsets, offset_heap,
+ prebuilt->old_vers_heap, old_vers, vrow);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ for (cond = UT_LIST_GET_FIRST(plan->end_conds);
+ cond != 0;
+ cond = UT_LIST_GET_NEXT(cond_list, cond)) {
+
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(static_cast<sym_node_t*>(cond->args));
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tests the other conditions.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/** Check that a clustered index record is visible in a consistent read view.
+@param rec clustered index record (in leaf page, or in memory)
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param view consistent read view
+@retval DB_SUCCESS if rec is visible in view
+@retval DB_SUCCESS_LOCKED_REC if rec is not visible in view
+@retval DB_CORRUPTION if the DB_TRX_ID is corrupted */
+static dberr_t row_sel_clust_sees(const rec_t *rec, const dict_index_t &index,
+ const rec_offs *offsets,
+ const ReadView &view)
+{
+ ut_ad(index.is_primary());
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, &index, offsets));
+ ut_ad(!rec_is_metadata(rec, index));
+ ut_ad(!index.table->is_temporary());
+
+ const trx_id_t id= row_get_rec_trx_id(rec, &index, offsets);
+
+ if (view.changes_visible(id))
+ return DB_SUCCESS;
+ if (UNIV_LIKELY(id < view.low_limit_id() || id < trx_sys.get_max_trx_id()))
+ return DB_SUCCESS_LOCKED_REC;
+
+ ib::warn() << "A transaction id in a record of table " << index.table->name
+ << " is newer than the system-wide maximum.";
+ return DB_CORRUPTION;
+}
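row_sel_clust_sees() above reduces visibility to two questions: does the read
view consider the change visible, and is the stored transaction id plausible
at all. A much-simplified, hypothetical model of the usual read-view
visibility rule follows; the real ReadView carries more state and uses
different member names.

#include <cstdint>
#include <set>

struct simple_read_view {
	uint64_t		up_limit_id;	/* ids below this committed before the view */
	uint64_t		low_limit_id;	/* ids at or above this did not exist yet */
	std::set<uint64_t>	active_ids;	/* transactions open when the view was created */

	bool changes_visible(uint64_t id) const
	{
		if (id < up_limit_id) {
			return true;	/* committed before the snapshot */
		}
		if (id >= low_limit_id) {
			return false;	/* started after the snapshot */
		}
		/* In between: visible only if not active at snapshot time. */
		return active_ids.count(id) == 0;
	}
};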
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_sel_get_clust_rec(
+/*==================*/
+ sel_node_t* node, /*!< in: select_node */
+ plan_t* plan, /*!< in: plan node for table */
+ rec_t* rec, /*!< in: record in a non-clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ *out_rec = NULL;
+
+ offsets = rec_get_offsets(rec, plan->pcur.index(), offsets,
+ plan->pcur.index()->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+ plan->clust_pcur.old_rec = nullptr;
+ plan->clust_pcur.btr_cur.page_cur.index = index;
+ dberr_t err = btr_pcur_open_with_no_init(plan->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ &plan->clust_pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ if (!node->read_view ||
+ !rec_get_deleted_flag(rec, plan->table->not_redundant())) {
+ err = DB_CORRUPTION;
+ }
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.cc
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ goto err_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+ trx_t* trx = thr_get_trx(thr);
+
+ /* At READ UNCOMMITTED or READ COMMITTED isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&plan->clust_pcur),
+ clust_rec, index, offsets,
+ node->row_lock_mode,
+ trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ ? LOCK_REC_NOT_GAP : LOCK_ORDINARY,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ /* Declare the variable uninitialized.
+ It should be set to DB_SUCCESS at func_exit. */
+ MEM_UNDEFINED(&err, sizeof err);
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ err = row_sel_clust_sees(clust_rec, *index, offsets,
+ *node->read_view);
+
+ switch (err) {
+ default:
+ goto err_exit;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ err = row_sel_build_prev_vers(
+ node->read_view, index, clust_rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto err_exit;
+ }
+ }
+
+ /* If we had to go to an earlier version of the row, or the
+ secondary index record is delete-marked, then the secondary
+ index record corresponding to clust_rec (or old_vers) may not
+ be rec; in that case we must ignore the row, because in our
+ snapshot rec would not have existed. Remember that we cannot
+ see directly from rec which transaction id corresponds to it:
+ we have to go to the clustered index record. A query that
+ fetches all rows whose secondary index value lies in some
+ interval would return a wrong result if we did not drop rows
+ that we reach through secondary index records which do not
+ really exist in our snapshot. */
+
+ if (old_vers || rec_get_deleted_flag(rec, dict_table_is_comp(
+ plan->table))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec,
+ plan->index, clust_rec,
+ index, thr);
+ if (err != DB_SUCCESS_LOCKED_REC) {
+ goto err_exit;
+ }
+ }
+ }
+
+ /* Fetch the columns needed in test conditions. The clustered
+ index record is protected by a page latch that was acquired
+ when plan->clust_pcur was positioned. The latch will not be
+ released until mtr->commit(). */
+
+ ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+ err = DB_SUCCESS;
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on an R-Tree record. This is an all-or-none action, mainly
+because we cannot reposition a record in an R-Tree (due to the nature
+of page splits).
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rtr_rec_lock(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* first_rec,/*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ matched_rec_t* match = pcur->btr_cur.rtr_info->matches;
+ mem_heap_t* heap = NULL;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* cur_block = btr_pcur_get_block(pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* my_offsets = const_cast<rec_offs*>(offsets);
+ rec_t* rec = const_cast<rec_t*>(first_rec);
+ rtr_rec_vector* match_rec;
+ rtr_rec_vector::iterator end;
+
+ rec_offs_init(offsets_);
+
+ if (match->locked || page_rec_is_supremum(first_rec)) {
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ ut_ad(page_align(first_rec) == cur_block->page.frame);
+ ut_ad(match->valid);
+
+ match->block.page.lock.x_lock();
+retry:
+ cur_block = btr_pcur_get_block(pcur);
+ ut_ad(match->block.page.lock.have_x()
+ || match->block.page.lock.have_s());
+ ut_ad(page_is_leaf(cur_block->page.frame));
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, cur_block, rec, index, my_offsets,
+ static_cast<lock_mode>(mode), type, thr);
+
+ if (err == DB_LOCK_WAIT) {
+re_scan:
+ mtr->commit();
+ trx->error_state = err;
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ if (row_mysql_handle_errors(
+ &err, trx, thr, NULL)) {
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr->start();
+
+ mysql_mutex_lock(&match->rtr_match_mutex);
+ if (!match->valid && match->matched_recs->empty()) {
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+
+ /* MDEV-14059 FIXME: why re-latch the block?
+ pcur is already positioned on it! */
+ cur_block = buf_page_get_gen(
+ btr_pcur_get_block(pcur)->page.id(),
+ btr_pcur_get_block(pcur)->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET, mtr, &err);
+ if (!cur_block) {
+ goto func_end;
+ }
+ } else {
+ mtr->start();
+ goto func_end;
+ }
+
+ DEBUG_SYNC_C("rtr_set_lock_wait");
+
+ if (!match->valid) {
+ /* Page got deleted */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ match->matched_recs->clear();
+ // FIXME: check for !cur_block
+
+ rtr_cur_search_with_match(
+ cur_block, index,
+ pcur->btr_cur.rtr_info->search_tuple,
+ pcur->btr_cur.rtr_info->search_mode,
+ &pcur->btr_cur.page_cur,
+ pcur->btr_cur.rtr_info);
+
+ if (!page_is_leaf(buf_block_get_frame(cur_block))) {
+ /* Page got splitted and promoted (only for
+ root page it is possible). Release the
+ page and ask for a re-search */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ my_offsets = offsets_;
+ my_offsets = rec_get_offsets(rec, index, my_offsets,
+ index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* No match record */
+ if (page_rec_is_supremum(rec) || !match->valid) {
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ goto retry;
+ }
+
+ my_offsets = offsets_;
+ match_rec = match->matched_recs;
+ end = match_rec->end();
+
+ for (rtr_rec_vector::iterator it = match_rec->begin();
+ it != end; ++it) {
+ rtr_rec_t* rtr_rec = &(*it);
+
+ my_offsets = rec_get_offsets(
+ rtr_rec->r_rec, index, my_offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, &match->block, rtr_rec->r_rec, index,
+ my_offsets, static_cast<lock_mode>(mode),
+ type, thr);
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ rtr_rec->locked = true;
+ } else if (err == DB_LOCK_WAIT) {
+ goto re_scan;
+ } else {
+ goto func_end;
+ }
+ }
+
+ match->locked = true;
+
+func_end:
+ match->block.page.lock.x_unlock();
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ ut_ad(err != DB_LOCK_WAIT);
+
+ return(err);
+}
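The control flow above is easier to see in isolation: every record matched by
the R-Tree search must be locked, and if any lock request has to wait, the
whole set is re-scanned because the tree may have been reorganized in the
meantime. A hypothetical skeleton of that all-or-none loop; the callables and
the int record handles are stand-ins, not InnoDB types:

#include <vector>

enum class lock_rc { GRANTED, WAIT, ERROR };

/* Lock every matched record, or start over after a lock wait. */
template <typename LockFn, typename RescanFn>
bool lock_all_or_rescan(std::vector<int> matches, LockFn try_lock, RescanFn rescan)
{
	for (;;) {
		bool waited = false;
		for (int rec : matches) {
			switch (try_lock(rec)) {
			case lock_rc::GRANTED:
				continue;	/* next matched record */
			case lock_rc::WAIT:
				waited = true;
				break;
			case lock_rc::ERROR:
				return false;
			}
			break;	/* leave the inner loop after a wait */
		}
		if (!waited) {
			return true;	/* every matched record is locked */
		}
		matches = rescan();	/* positions may have changed while waiting */
	}
}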
+
+/*********************************************************************//**
+Sets a lock on a record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rec_lock(
+/*=============*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_t* trx;
+ dberr_t err = DB_SUCCESS;
+ const buf_block_t* block;
+
+ block = btr_pcur_get_block(pcur);
+
+ trx = thr_get_trx(thr);
+
+ if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000
+ && buf_pool.running_out()) {
+ return DB_LOCK_TABLE_FULL;
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<lock_mode>(mode), type, thr);
+ } else {
+
+ if (dict_index_is_spatial(index)) {
+ if (type == LOCK_GAP || type == LOCK_ORDINARY) {
+ ut_ad(0);
+ ib::error() << "Incorrectly request GAP lock "
+ "on RTree";
+ return(DB_SUCCESS);
+ }
+ err = sel_set_rtr_rec_lock(pcur, rec, index, offsets,
+ mode, type, thr, mtr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<lock_mode>(mode), type, thr);
+ }
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Opens a pcur to a table index. */
+MY_ATTRIBUTE((warn_unused_result, nonnull))
+static
+dberr_t
+row_sel_open_pcur(
+/*==============*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(!plan->n_rows_prefetched);
+ ut_ad(!plan->n_rows_fetched);
+ ut_ad(!plan->cursor_at_end);
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ plan->pcur.old_rec = nullptr;
+ plan->pcur.btr_cur.page_cur.index = index;
+
+ dberr_t err;
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ err = btr_pcur_open_with_no_init(plan->tuple,
+ plan->mode, BTR_SEARCH_LEAF,
+ &plan->pcur, mtr);
+ } else {
+ err = plan->pcur.open_leaf(plan->asc, index, BTR_SEARCH_LEAF,
+ mtr);
+ }
+
+ plan->pcur_is_open = err == DB_SUCCESS;
+ return err;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position =
+ plan->pcur.restore_position(BTR_SEARCH_LEAF, mtr) ==
+ btr_pcur_t::SAME_ALL;
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
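The ascending-cursor half of the rules spelled out above can be condensed into
a small decision function. This is only a distilled restatement for
readability, with the BTR_PCUR_* relative positions modelled by a local enum.

enum class rel_pos { BEFORE, ON, AFTER };

/* Should an ascending cursor move to the next record after restore? */
static bool must_move_next_asc(rel_pos pos, bool equal_position,
			       bool stored_rec_processed)
{
	switch (pos) {
	case rel_pos::ON:
		/* Back on a record <= the stored one: move on unless we are
		   on the very same record and it was not processed yet. */
		return equal_position ? stored_rec_processed : true;
	case rel_pos::AFTER:
		/* Already past the stored record; it has not been processed. */
		return false;
	case rel_pos::BEFORE:
	default:
		/* Not allowed for an ascending cursor (see the comment above). */
		return false;
	}
}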
+
+/*********************************************************************//**
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /*!< in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always).
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ sel_node_t* node, /*!< in: select node for a consistent read */
+ plan_t* plan, /*!< in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index = plan->index;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(node->read_view);
+ ut_ad(node->read_view->is_open());
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+
+ if (row_sel_open_pcur(plan, mtr) != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
+
+ const rec_t* rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
+ return SEL_RETRY;
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+ return SEL_EXHAUSTED;
+ }
+
+ if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!node->read_view->changes_visible(bulk_trx_id)) {
+ return SEL_EXHAUSTED;
+ }
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_is_clust(index)) {
+ if (row_sel_clust_sees(rec, *index, offsets, *node->read_view)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
+ } else if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(trx_id);
+ if (!node->read_view->sees(trx_id)) {
+ return SEL_RETRY;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
+ return SEL_EXHAUSTED;
+ }
+
+ /* Fetch the columns needed in test conditions. The index
+ record is protected by a page latch that was acquired when
+ plan->pcur was positioned. The latch will not be released
+ until mtr->commit(). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+ return SEL_EXHAUSTED;
+ }
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ plan->n_rows_fetched++;
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return SEL_FOUND;
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*********************************************************************//**
+Performs a select step.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_sel(
+/*====*/
+ sel_node_t* node, /*!< in: select node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ dberr_t err;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ const trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(thr->run_node == node);
+ ut_ad(!node->read_view || node->read_view == &trx->read_view);
+ ut_ad(!node->read_view || node->read_view->is_open());
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the end of the result set: no
+ more rows to process for this table cursor, and the prefetch
+ stack is empty as well */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr.start();
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (node->read_view && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust) {
+ switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
+ case SEL_FOUND:
+ goto next_table;
+ case SEL_EXHAUSTED:
+ goto table_exhausted;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case SEL_RETRY:
+ break;
+ }
+
+ plan_reset_cursor(plan);
+
+ mtr.commit();
+ mtr.start();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+ err = row_sel_open_pcur(plan, &mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto mtr_commit_exit;
+ }
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+ if (!node->read_view
+ || trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ goto table_exhausted;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && !page_rec_is_supremum(rec)) {
+
+ /* Do not support "descending search" for Spatial index */
+ ut_ad(!dict_index_is_spatial(index));
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!node->read_view) {
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!next_rec)) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
+ unsigned lock_type;
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* At READ UNCOMMITTED or READ COMMITTED
+ isolation level, we lock only the record,
+ i.e., next-key locking is not used. */
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ if (page_rec_is_supremum(next_rec)) {
+ goto skip_lock;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(&plan->pcur,
+ next_rec, index, offsets,
+ node->row_lock_mode,
+ lock_type, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+skip_lock:
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (rec_is_metadata(rec, *index)) {
+ /* Skip the metadata pseudo-record. */
+ cost_counter++;
+ goto next_rec;
+ }
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+ unsigned lock_type;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* At READ UNCOMMITTED or READ COMMITTED isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ || dict_index_is_spatial(index)) {
+
+ if (page_rec_is_supremum(rec)) {
+
+ goto next_rec;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(&plan->pcur,
+ rec, index, offsets,
+ node->row_lock_mode, lock_type,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ stored the cursor position BEFORE placing a record lock, the
+ cursor might jump over some records that another transaction
+ could meanwhile insert adjacent to the cursor: this would
+ result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+ /* PHASE 2: Check a mixed index mix id if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (node->read_view) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (dict_index_is_clust(index)) {
+ const trx_id_t id = row_get_rec_trx_id(
+ rec, index, offsets);
+
+ if (!node->read_view->changes_visible(id)) {
+ if (id >= node->read_view->low_limit_id()
+ && id >= trx_sys.get_max_trx_id()) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The record does not exist
+ in our read view. Skip it, but
+ first attempt to determine
+ whether the index segment we
+ are searching through has been
+ exhausted. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* Fetch the columns needed in
+ test conditions. The clustered
+ index record is protected by a
+ page latch that was acquired
+ by row_sel_open_pcur() or
+ row_sel_restore_pcur_pos().
+ The latch will not be released
+ until mtr.commit(). */
+
+ row_sel_fetch_columns(
+ index, rec, offsets,
+ UT_LIST_GET_FIRST(
+ plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(trx_id);
+ if (!node->read_view->sees(trx_id)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions. The record is
+ protected by a page latch that was acquired by
+ row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
+ will not be released until mtr.commit(). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(node->read_view);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(plan->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(rec_get_trx_id(clust_rec,
+ dict_table_get_first_index(
+ plan->table)));
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_enqueue_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+ } else {
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ err = DB_SUCCESS;
+ goto mtr_commit_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr.commit();
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+mtr_commit_exit:
+ mtr.commit();
+
+func_exit:
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
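The record-lock type chosen in the locking-read paths of row_sel() above (lock
only the record at READ UNCOMMITTED / READ COMMITTED or on a spatial index,
otherwise take a next-key lock) can be summarized by a small helper. This is a
restatement for clarity, not a function that exists in the source.

enum class gap_mode { REC_NOT_GAP, ORDINARY };

/* Pick between record-only and next-key locking. */
static gap_mode select_lock_gap_mode(bool read_committed_or_lower, bool is_spatial)
{
	return (read_committed_or_lower || is_spatial)
		? gap_mode::REC_NOT_GAP		/* lock only the record */
		: gap_mode::ORDINARY;		/* next-key lock */
}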
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<sel_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+ /* If this node is executed anew (or when execution
+ resumes after waiting for a table intention lock), set intention locks
+ on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(thr_get_trx(thr), false);
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ trx_t *trx = thr_get_trx(thr);
+ /* Assign a read view for the query */
+ trx->read_view.open(trx);
+ node->read_view = trx->read_view.is_open() ?
+ &trx->read_view : NULL;
+ } else {
+ sym_node_t* table_node;
+ lock_mode i_lock_mode;
+
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ for (table_node = node->table_list;
+ table_node != 0;
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node))) {
+
+ dberr_t err = lock_table(
+ table_node->table, nullptr,
+ i_lock_mode, thr);
+
+ if (err != DB_SUCCESS) {
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+ trx->error_state = err;
+
+ return(NULL);
+ }
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor
+ && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ dberr_t err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<fetch_node_t*>(thr->run_node);
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ if (node->into_list) {
+ sel_assign_into_var_values(node->into_list,
+ sel_node);
+ } else {
+ ibool ret = (*node->func->func)(
+ sel_node, node->func->arg);
+
+ if (!ret) {
+ sel_node->state
+ = SEL_NODE_NO_MORE_ROWS;
+ }
+ }
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ ib::error() << "fetch called on a closed cursor";
+
+ thr_get_trx(thr)->error_state = DB_ERROR;
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = static_cast<row_printf_node_t*>(thr->run_node);
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len) /*!< in: MySQL key value length */
+{
+ byte* original_buf = buf;
+ const byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ const byte* key_end;
+ ulint n_fields = 0;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
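+	/* Each key part parsed below follows the MySQL key value layout
+	(an illustration, not exhaustive): an optional 1-byte SQL NULL
+	flag for nullable columns, then for true VARCHAR and column
+	prefix key parts a 2-byte little-endian length, and finally the
+	value itself padded to the full key part length. */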
+ while (key_ptr < key_end) {
+
+ ulint type = dfield_get_type(dfield)->mtype;
+ ut_a(field->col->mtype == type);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_null(dfield);
+
+ is_null = TRUE;
+ }
+ }
+
+ /* Calculate data length and data field total length */
+ if (DATA_LARGE_MTYPE(type) || DATA_GEOMETRY_MTYPE(type)) {
+
+ /* For R-tree index, data length should be the
+ total size of the wkb data.*/
+ if (dict_index_is_spatial(index)) {
+ ut_ad(DATA_GEOMETRY_MTYPE(type));
+ data_len = key_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ /* The key field is a column prefix of a BLOB
+ or TEXT. */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length to the
+ first 2 bytes after the optional SQL NULL
+ marker byte. The storage format is
+ little-endian, that is, the most significant
+ byte at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer,
+ even though the actual value only takes data
+ len bytes from the start. */
+
+ data_len = ulint(key_ptr[data_offset])
+ | ulint(key_ptr[data_offset + 1]) << 8;
+ data_field_len = data_offset + 2
+ + field->prefix_len;
+
+ data_offset += 2;
+
+ /* Now that we know the length, we store the
+ column value like it would be a fixed char
+ field */
+ }
+
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+			0xff. TODO: in that case, does it do any harm to
+			compare with the full prefix_len bytes? How do characters
+ 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if ((dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR)
+ && (type != DATA_INT)) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
+
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (UNIV_LIKELY(!is_null)) {
+ buf = row_mysql_store_col_in_innobase_format(
+ dfield, buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset, data_len,
+ dict_table_is_comp(index->table));
+ ut_a(buf <= original_buf + buf_len);
+ }
+
+ key_ptr += data_field_len;
+
+ if (UNIV_UNLIKELY(key_ptr > key_end)) {
+ /* The last field in key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+ trick to calculate LIKE 'abc%' type queries there
+ should never be partial-field prefixes in searches. */
+
+ ib::warn() << "Using a partial-field key prefix in"
+ " search, index " << index->name
+ << " of table " << index->table->name
+ << ". Last data field length "
+ << data_field_len << " bytes, key ptr now"
+ " exceeds key end by " << (key_ptr - key_end)
+ << " bytes. Key value in the MariaDB format:";
+
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ putc('\n', stderr);
+
+ if (!is_null) {
+ ulint len = dfield_get_len(dfield);
+ dfield_set_len(dfield, len
+ - (ulint) (key_ptr - key_end));
+ }
+ ut_ad(0);
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+void
+row_sel_field_store_in_mysql_format_func(
+ byte* dest,
+ const mysql_row_templ_t* templ,
+#ifdef UNIV_DEBUG
+ const dict_index_t* index,
+ ulint field_no,
+#endif /* UNIV_DEBUG */
+ const byte* data,
+ ulint len)
+{
+#ifdef UNIV_DEBUG
+ const dict_field_t* field
+ = templ->is_virtual
+ ? NULL : dict_index_get_nth_field(index, field_no);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(len != UNIV_SQL_NULL);
+ MEM_CHECK_DEFINED(data, len);
+ MEM_CHECK_ADDRESSABLE(dest, templ->mysql_col_len);
+ MEM_UNDEFINED(dest, templ->mysql_col_len);
+
+ byte* pad = dest + len;
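+	/* pad points to the first byte after the len data bytes that will
+	be copied to dest; the cases below fill the remaining bytes of the
+	MySQL column buffer according to the column type. */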
+
+ switch (templ->type) {
+ const byte* field_end;
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
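+			/* For example (illustration only): with
+			mysql_length_bytes == 2 and len == 300, the two
+			leading bytes become 0x2C 0x01 (little-endian),
+			followed by the actual data bytes. */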
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, templ->mysql_length_bytes);
+ /* Copy the actual data. Leave the rest of the
+ buffer uninitialized. */
+ memcpy(dest, data, len);
+ break;
+ }
+
+ /* Copy the actual data */
+ memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. */
+
+ if (pad == field_end) {
+ break;
+ }
+
+ if (UNIV_UNLIKELY(templ->type == DATA_FIXBINARY)) {
+ memset(pad, 0, field_end - pad);
+ break;
+ }
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We treat some Unicode charset strings specially. */
+ switch (templ->mbminlen) {
+ case 4:
+ /* InnoDB should never have stripped partial
+ UTF-32 characters. */
+ ut_a(!(len & 3));
+ break;
+ case 2:
+ /* A space char is two bytes,
+ 0x0020 in UCS2 and UTF-16 */
+
+ if (UNIV_UNLIKELY(len & 1)) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad < field_end) {
+ *pad++ = 0x20;
+ }
+ }
+ }
+
+ row_mysql_pad_col(templ->mbminlen, pad,
+ ulint(field_end - pad));
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ break;
+
+ case DATA_GEOMETRY:
+ /* We store all geometry data as BLOB data at server layer. */
+ row_mysql_store_geometry(dest, templ->mysql_col_len, data, len);
+ break;
+
+ case DATA_MYSQL:
+ memcpy(dest, data, len);
+
+ ut_ad(templ->mysql_col_len >= len);
+ ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+ /* If field_no equals to templ->icp_rec_field_no,
+ we are examining a row pointed by "icp_rec_field_no".
+ There is possibility that icp_rec_field_no refers to
+ a field in a secondary index while templ->rec_field_no
+ points to field in a primary index. The length
+ should still be equal, unless the field pointed
+ by icp_rec_field_no has a prefix */
+ ut_ad(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0));
+
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0)
+ || templ->rec_field_is_prefix);
+
+ ut_ad(templ->is_virtual
+ || !(field->prefix_len % templ->mbmaxlen));
+
+ if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.cc, function
+ row_mysql_store_col_in_innobase_format(). */
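+			/* For example (illustration): a CHAR(10) column in
+			utf8mb3 occupies 10 * mbmaxlen = 30 bytes in the
+			MySQL row buffer, while InnoDB stores the value with
+			trailing spaces stripped; the memset() below restores
+			the 0x20 padding up to mysql_col_len. */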
+
+ memset(pad, 0x20, templ->mysql_col_len - len);
+ }
+ break;
+
+ default:
+#ifdef UNIV_DEBUG
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+ /* fall through */
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+#endif /* UNIV_DEBUG */
+ ut_ad((templ->is_virtual && !field)
+ || (field && field->prefix_len
+ ? field->prefix_len == len
+ : templ->mysql_col_len == len));
+ memcpy(dest, data, len);
+ break;
+
+ case DATA_INT:
+ /* Convert InnoDB big-endian integer to little-endian
+ format, sign bit restored to 2's complement form */
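+		/* For example (a sketch for a signed 4-byte INT holding the
+		value 1): InnoDB stores the bytes 80 00 00 01 (big-endian,
+		sign bit flipped); the loop below reverses them to
+		01 00 00 80, and the final XOR of the top byte restores the
+		2's complement form, yielding 01 00 00 00. */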
+ DBUG_ASSERT(templ->mysql_col_len == len);
+
+ byte* ptr = pad;
+ do *--ptr = *data++; while (ptr != dest);
+ if (!templ->is_unsigned) {
+ pad[-1] ^= 0x80;
+ }
+ }
+}
+
+/** Convert a field in the Innobase format to a field in the MySQL format.
+@param[out] mysql_rec record in the MySQL format
+@param[in,out] prebuilt prebuilt struct
+@param[in] rec InnoDB record; must be protected
+ by a page latch
+@param[in] index index of rec
+@param[in] offsets array returned by rec_get_offsets()
+@param[in] field_no templ->rec_field_no or
+ templ->clust_rec_field_no
+ or templ->icp_rec_field_no
+@param[in] templ row template
+*/
+static MY_ATTRIBUTE((warn_unused_result))
+ibool
+row_sel_store_mysql_field(
+ byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const dict_index_t* index,
+ const rec_offs* offsets,
+ ulint field_no,
+ const mysql_row_templ_t*templ)
+{
+ DBUG_ENTER("row_sel_store_mysql_field_func");
+
+ const byte* data;
+ ulint len;
+
+ ut_ad(prebuilt->default_rec);
+ ut_ad(templ);
+ ut_ad(templ >= prebuilt->mysql_template);
+ ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
+ ut_ad(field_no == templ->clust_rec_field_no
+ || field_no == templ->rec_field_no
+ || field_no == templ->icp_rec_field_no);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) {
+
+ mem_heap_t* heap;
+ /* Copy an externally stored field to a temporary heap */
+
+ ut_ad(field_no == templ->clust_rec_field_no);
+
+ if (DATA_LARGE_MTYPE(templ->type)) {
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ srv_page_size);
+ }
+
+ heap = prebuilt->blob_heap;
+ } else {
+ heap = mem_heap_create(srv_page_size);
+ }
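+		/* Large column types are copied into prebuilt->blob_heap,
+		which stays valid after this call, so that the pointer stored
+		in the MySQL record remains usable; other externally stored
+		columns only need the temporary heap, which is freed before
+		returning. */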
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets, prebuilt->table->space->zip_size(),
+ field_no, &len, heap);
+
+ if (UNIV_UNLIKELY(!data)) {
+ /* The externally stored field was not written
+ yet. This record should only be seen by
+ trx_rollback_recovered() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+
+ ut_a(prebuilt->trx->isolation_level
+ == TRX_ISO_READ_UNCOMMITTED);
+ DBUG_RETURN(FALSE);
+ }
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* The field is stored in the index record, or
+ in the metadata for instant ADD COLUMN. */
+ data = rec_get_nth_cfield(rec, index, offsets, field_no, &len);
+
+ if (len == UNIV_SQL_NULL) {
+ /* MySQL assumes that the field for an SQL
+ NULL value is set to the default value. */
+ ut_ad(templ->mysql_null_bit_mask);
+
+ MEM_CHECK_DEFINED(prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ DBUG_RETURN(TRUE);
+ }
+
+ if (DATA_LARGE_MTYPE(templ->type)
+ || DATA_GEOMETRY_MTYPE(templ->type)) {
+
+ /* It is a BLOB field locally stored in the
+ InnoDB record: we MUST copy its contents to
+ prebuilt->blob_heap here because
+ row_sel_field_store_in_mysql_format() stores a
+ pointer to the data, and the data passed to us
+ will be invalid as soon as the
+ mini-transaction is committed and the page
+ latch on the clustered index page is
+ released. */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ srv_page_size);
+ DBUG_PRINT("anna", ("blob_heap allocated: %p",
+ prebuilt->blob_heap));
+ }
+
+ data = static_cast<byte*>(
+ mem_heap_dup(prebuilt->blob_heap, data, len));
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+ }
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= static_cast<byte>(~templ->mysql_null_bit_mask);
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/** Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec; other columns are left blank. Not all columns may
+be needed in the query.
+@param[out] mysql_rec row in the MySQL format
+@param[in] prebuilt cursor
+@param[in] rec Innobase record in the index
+ which was described in prebuilt's
+ template, or in the clustered index;
+ must be protected by a page latch
+@param[in] vrow virtual columns
+@param[in] rec_clust whether index must be the clustered index
+@param[in] index index of rec
+@param[in] offsets array returned by rec_get_offsets(rec)
+@retval true on success
+@retval false if not all columns could be retrieved */
+MY_ATTRIBUTE((warn_unused_result))
+static bool row_sel_store_mysql_rec(
+ byte* mysql_rec,
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const dtuple_t* vrow,
+ bool rec_clust,
+ const dict_index_t* index,
+ const rec_offs* offsets)
+{
+ DBUG_ENTER("row_sel_store_mysql_rec");
+
+ ut_ad(rec_clust || index == prebuilt->index);
+ ut_ad(!rec_clust || dict_index_is_clust(index));
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+ if (templ->is_virtual && dict_index_is_clust(index)) {
+ /* Skip virtual columns if it is not a covered
+ search or virtual key read is not requested. */
+ if (!rec_clust
+ || !prebuilt->index->has_virtual()
+ || !prebuilt->read_just_key) {
+ /* Initialize the NULL bit. */
+ if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ }
+ continue;
+ }
+
+ dict_v_col_t* col;
+ col = dict_table_get_nth_v_col(
+ index->table, templ->clust_rec_field_no);
+
+ ut_ad(vrow);
+
+ const dfield_t* dfield = dtuple_get_nth_v_field(
+ vrow, col->v_pos);
+
+ if (dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ ut_ad("no ha_innopart in MariaDB" == 0);
+ continue;
+ }
+
+ if (dfield->len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ memcpy(mysql_rec
+ + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ } else {
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, templ->clust_rec_field_no,
+ (const byte*)dfield->data, dfield->len);
+ if (templ->mysql_null_bit_mask) {
+ mysql_rec[
+ templ->mysql_null_byte_offset]
+ &= static_cast<byte>
+ (~templ->mysql_null_bit_mask);
+ }
+ }
+
+ continue;
+ }
+
+ const ulint field_no
+ = rec_clust
+ ? templ->clust_rec_field_no
+ : templ->rec_field_no;
+ /* We should never deliver column prefixes to the SQL layer,
+ except for evaluating handler_index_cond_check()
+ or handler_rowid_filter_check(). */
+ /* ...actually, we do want to do this in order to
+ support the prefix query optimization.
+
+ ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+ == 0);
+
+ ...so we disable this assert. */
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, index, offsets,
+ field_no, templ)) {
+
+ DBUG_RETURN(false);
+ }
+ }
+
+ /* FIXME: We only need to read the doc_id if an FTS indexed
+ column is being updated.
+	NOTE: the record can be a clustered or secondary index record.
+	If a secondary index is used, then the FTS_DOC_ID column should be
+	part of that index. */
+ if (dict_table_has_fts_index(prebuilt->table)) {
+ if (dict_index_is_clust(index)
+ || prebuilt->fts_doc_id_in_read_set) {
+ prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
+ rec, index, offsets);
+ }
+ }
+
+ DBUG_RETURN(true);
+}
+
+static void row_sel_reset_old_vers_heap(row_prebuilt_t *prebuilt)
+{
+ if (prebuilt->old_vers_heap)
+ mem_heap_empty(prebuilt->old_vers_heap);
+ else
+ prebuilt->old_vers_heap= mem_heap_create(200);
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ rec_offs** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ dtuple_t** vrow, /*!< out: dtuple to hold old virtual
+ column data */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ row_sel_reset_old_vers_heap(prebuilt);
+
+ return row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets,
+ &prebuilt->trx->read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers, vrow);
+}
+
+/** Helper class to cache clust_rec and old_vers */
+class Row_sel_get_clust_rec_for_mysql
+{
+ const rec_t *cached_clust_rec;
+ rec_t *cached_old_vers;
+ lsn_t cached_lsn;
+ page_id_t cached_page_id;
+
+#ifdef UNIV_DEBUG
+ void check_eq(const dict_index_t *index, const rec_offs *offsets) const
+ {
+ rec_offs vers_offs[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS];
+ rec_offs_init(vers_offs);
+ mem_heap_t *heap= nullptr;
+
+ ut_ad(rec_offs_validate(cached_clust_rec, index, offsets));
+ ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets));
+ ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs,
+ index->n_core_fields,
+ index->db_trx_id(), &heap));
+ ut_ad(!heap);
+ for (auto n= index->db_trx_id(); n--; )
+ {
+ const dict_col_t *col= dict_index_get_nth_col(index, n);
+ ulint len1, len2;
+ const byte *b1= rec_get_nth_field(cached_clust_rec, offsets, n, &len1);
+ const byte *b2= rec_get_nth_field(cached_old_vers, vers_offs, n, &len2);
+ ut_ad(!cmp_data(col->mtype, col->prtype, false, b1, len1, b2, len2));
+ }
+ }
+#endif
+
+public:
+ Row_sel_get_clust_rec_for_mysql() :
+ cached_clust_rec(NULL), cached_old_vers(NULL), cached_lsn(0),
+ cached_page_id(page_id_t(0,0)) {}
+
+ dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index,
+ const rec_t *rec, que_thr_t *thr, const rec_t **out_rec,
+ rec_offs **offsets, mem_heap_t **offset_heap,
+ dtuple_t **vrow, mtr_t *mtr);
+};
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+dberr_t
+Row_sel_get_clust_rec_for_mysql::operator()(
+/*============================*/
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/*!< in: secondary index where rec resides */
+ const rec_t* rec, /*!< in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /*!< in: query thread */
+ const rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+					in the read view, i.e., it was a freshly
+					inserted version */
+ rec_offs** offsets,/*!< in: offsets returned by
+ rec_get_offsets(rec, sec_index);
+ out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ dtuple_t** vrow, /*!< out: virtual column to fill */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ rec_t* old_vers;
+ trx_t* trx;
+
+ prebuilt->clust_pcur->old_rec = nullptr;
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+ sec_index, *offsets);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+ prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index;
+
+ dberr_t err = btr_pcur_open_with_no_init(prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+
+ const rec_t* clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur->trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(prebuilt->pcur);
+
+ /* If this is a spatial index scan, and we are reading
+ from a shadow buffer, the record could be already
+ deleted (due to rollback etc.). So get the original
+ page and verify that */
+ if (dict_index_is_spatial(sec_index)
+ && btr_cur->rtr_info->matches
+ && (page_align(rec)
+ == btr_cur->rtr_info->matches->block.page.frame
+ || rec != btr_pcur_get_rec(prebuilt->pcur))) {
+#ifdef UNIV_DEBUG
+ rtr_info_t* rtr_info = btr_cur->rtr_info;
+ mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex);
+ /* The page could be deallocated (by rollback etc.) */
+ if (!rtr_info->matches->valid) {
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
+ clust_rec = NULL;
+ goto func_exit;
+ }
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
+
+ if (rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ && prebuilt->select_lock_type == LOCK_NONE) {
+
+ clust_rec = NULL;
+ goto func_exit;
+ }
+
+ if (rec != btr_pcur_get_rec(prebuilt->pcur)) {
+ clust_rec = NULL;
+ goto func_exit;
+ }
+
+ /* FIXME: Why is this block not the
+ same as btr_pcur_get_block(prebuilt->pcur),
+ and is it not unsafe to use RW_NO_LATCH here? */
+ buf_block_t* block = buf_page_get_gen(
+ btr_pcur_get_block(prebuilt->pcur)->page.id(),
+ btr_pcur_get_block(prebuilt->pcur)->zip_size(),
+ RW_NO_LATCH, NULL, BUF_GET, mtr, &err);
+ ut_ad(block); // FIXME: avoid crash
+ mem_heap_t* heap = mem_heap_create(256);
+ dtuple_t* tuple = dict_index_build_data_tuple(
+ rec, sec_index, true,
+ sec_index->n_fields, heap);
+ page_cur_t page_cursor;
+ page_cursor.block = block;
+ page_cursor.index = sec_index;
+ ulint up_match = 0, low_match = 0;
+ ut_ad(!page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match,
+ &low_match,
+ &page_cursor,
+ nullptr));
+ ut_ad(low_match < dtuple_get_n_fields_cmp(tuple));
+ mem_heap_free(heap);
+ err = DB_SUCCESS;
+#endif /* UNIV_DEBUG */
+ } else if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ /* In a rare case it is possible that no clust
+ rec is found for a delete-marked secondary index
+ record: if row_undo_mod_clust() has already removed
+ the clust rec, while purge is still cleaning and
+ removing secondary index records associated with
+ earlier versions of the clustered index record.
+ In that case we know that the clustered index
+ record did not exist in the read view of trx. */
+ ib::error() << "Clustered record for sec rec not found"
+ " index " << sec_index->name
+ << " of table " << sec_index->table->name;
+
+ fputs("InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ err = DB_CORRUPTION;
+ }
+
+ clust_rec = NULL;
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(prebuilt->clust_pcur),
+ clust_rec, clust_index, *offsets,
+ prebuilt->select_lock_type,
+ LOCK_REC_NOT_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ return err;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+		if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+		    || clust_index->table->is_temporary()) {
+			/* If the isolation level allows reading of
+			uncommitted data, or the table is temporary,
+			then we never look for an earlier version */
+		} else {
+			err = row_sel_clust_sees(clust_rec, *clust_index,
+						 *offsets, trx->read_view);
+		}
+
+ switch (err) {
+ default:
+ return err;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ const buf_page_t& bpage = btr_pcur_get_block(
+ prebuilt->clust_pcur)->page;
+
+ const lsn_t lsn = mach_read_from_8(
+ page_align(clust_rec) + FIL_PAGE_LSN);
+
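+			/* If the page LSN, the page id and the record
+			pointer are all unchanged since the previous call,
+			the clustered index record cannot have been modified
+			in between, so the previously built old version can
+			be reused instead of rebuilding it. */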
+ if (lsn != cached_lsn
+ || bpage.id() != cached_page_id
+ || clust_rec != cached_clust_rec) {
+ /* The following call returns 'offsets' associated with
+ 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ prebuilt, clust_index,
+ clust_rec, offsets, offset_heap, &old_vers,
+ vrow, mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ cached_lsn = lsn;
+ cached_page_id = bpage.id();
+ cached_clust_rec = clust_rec;
+ cached_old_vers = old_vers;
+ } else {
+ err = DB_SUCCESS;
+ old_vers = cached_old_vers;
+
+ /* The offsets need not be same for the latest
+ version of clust_rec and its old version
+ old_vers. Re-calculate the offsets for old_vers. */
+
+ if (old_vers) {
+ ut_d(check_eq(clust_index, *offsets));
+ *offsets = rec_get_offsets(
+ old_vers, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+ }
+ }
+
+ if (old_vers == NULL) {
+ return err;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we would not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+		/* For a spatial index, since the rec may come from the shadow
+		buffer, we also need to check that it exactly matches the
+		clust_rec. */
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || dict_index_is_spatial(sec_index)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index, thr);
+ switch (err) {
+ case DB_SUCCESS:
+ clust_rec = NULL;
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ return err;
+ }
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ return err;
+}
+
+/** Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@param[out] same_user_rec true if we were able to restore the cursor on a user
+record with the same ordering prefix in the B-tree index
+@param[in] latch_mode latch mode wished in restoration
+@param[in] pcur cursor whose position has been stored
+@param[in] moves_up true if the cursor moves up in the index
+@param[in,out] mtr mtr; CAUTION: may commit mtr temporarily!
+@return true if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static bool sel_restore_position_for_mysql(bool *same_user_rec,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *pcur,
+ bool moves_up, mtr_t *mtr)
+{
+ auto status = pcur->restore_position(latch_mode, mtr);
+
+ *same_user_rec = status == btr_pcur_t::SAME_ALL;
+
+ ut_ad(!*same_user_rec || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+ ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+ || pcur->rel_pos == BTR_PCUR_AFTER);
+ } else {
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+ == btr_pcur_is_on_user_rec(pcur));
+ }
+#endif /* UNIV_DEBUG */
+
+	/* The position may need to be adjusted for rel_pos and moves_up. */
+
+ switch (pcur->rel_pos) {
+ case BTR_PCUR_ON:
+ if (!*same_user_rec && moves_up) {
+ if (status == btr_pcur_t::SAME_UNIQ)
+ return true;
+next:
+ if (btr_pcur_move_to_next(pcur, mtr)
+ && rec_is_metadata(btr_pcur_get_rec(pcur),
+ *pcur->index())) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return true;
+ }
+ return(!*same_user_rec);
+ case BTR_PCUR_AFTER_LAST_IN_TREE:
+ case BTR_PCUR_BEFORE_FIRST_IN_TREE:
+ return true;
+ case BTR_PCUR_AFTER:
+ /* positioned to record after pcur->old_rec. */
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+prev:
+ if (btr_pcur_is_on_user_rec(pcur) && !moves_up
+ && !rec_is_metadata(btr_pcur_get_rec(pcur),
+ *pcur->index())) {
+ if (!btr_pcur_move_to_prev(pcur, mtr)) {
+ return true;
+ }
+ }
+ return true;
+ case BTR_PCUR_BEFORE:
+ /* For non optimistic restoration:
+ The position is now set to the record before pcur->old_rec.
+
+ For optimistic restoration:
+ The position also needs to take the previous search_mode into
+ consideration. */
+
+ switch (pcur->pos_state) {
+ case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+ if (pcur->search_mode == PAGE_CUR_GE) {
+ /* Positioned during Greater or Equal search
+ with BTR_PCUR_BEFORE. Optimistic restore to
+				the same record. If scanning for lower values,
+				we must move to the previous record.
+ This can happen with:
+ HANDLER READ idx a = (const);
+ HANDLER READ idx PREV; */
+ goto prev;
+ }
+ return true;
+ case BTR_PCUR_IS_POSITIONED:
+ if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+ goto next;
+ }
+ return true;
+ case BTR_PCUR_WAS_POSITIONED:
+ case BTR_PCUR_NOT_POSITIONED:
+ break;
+ }
+ }
+ ut_ad(0);
+ return true;
+}
+
+/********************************************************************//**
+Copies a cached field for MySQL from the fetch cache. */
+static
+void
+row_sel_copy_cached_field_for_mysql(
+/*================================*/
+ byte* buf, /*!< in/out: row buffer */
+ const byte* cache, /*!< in: cached row */
+ const mysql_row_templ_t*templ) /*!< in: column template */
+{
+ ulint len;
+
+ buf += templ->mysql_col_offset;
+ cache += templ->mysql_col_offset;
+
+ MEM_CHECK_ADDRESSABLE(buf, templ->mysql_col_len);
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
+ && (templ->type != DATA_INT)) {
+ /* Check for != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR!
+ Find the actual length of the true VARCHAR field. */
+ row_mysql_read_true_varchar(
+ &len, cache, templ->mysql_length_bytes);
+ len += templ->mysql_length_bytes;
+ MEM_UNDEFINED(buf, templ->mysql_col_len);
+ } else {
+ len = templ->mysql_col_len;
+ }
+
+ memcpy(buf, cache, len);
+}
+
+/** Copy used fields from cached row.
+Copy cache record field by field, don't touch fields that
+are not covered by current key.
+@param[out] buf Where to copy the MySQL row.
+@param[in] cached_rec What to copy (in MySQL row format).
+@param[in] prebuilt prebuilt struct. */
+void
+row_sel_copy_cached_fields_for_mysql(
+ byte* buf,
+ const byte* cached_rec,
+ row_prebuilt_t* prebuilt)
+{
+ const mysql_row_templ_t*templ;
+ ulint i;
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+
+ /* Skip virtual columns */
+ if (templ->is_virtual) {
+ continue;
+ }
+
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
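+		/* The expression below is the bit-merge idiom
+		b ^= (b ^ c) & mask: it copies only the bit selected by
+		mysql_null_bit_mask from cached_rec into buf, leaving the
+		other NULL-flag bits in the same byte untouched. */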
+ if (templ->mysql_null_bit_mask) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+ buf[templ->mysql_null_byte_offset]
+ ^= (buf[templ->mysql_null_byte_offset]
+ ^ cached_rec[templ->mysql_null_byte_offset])
+ & (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+ }
+ }
+}
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_dequeue_cached_row_for_mysql(
+/*=================================*/
+ byte* buf, /*!< in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
+{
+ ulint i;
+ const mysql_row_templ_t*templ;
+ const byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+ ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
+ MEM_CHECK_ADDRESSABLE(buf, prebuilt->mysql_row_len);
+
+ cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+ if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
+ row_sel_copy_cached_fields_for_mysql(buf, cached_rec, prebuilt);
+ } else if (prebuilt->mysql_prefix_len > 63) {
+		/* The record is long. Copy it field by field, in case
+		there are some long VARCHAR columns of which only a
+		small length is being used. */
+ MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len);
+
+ /* First copy the NULL bits. */
+ memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
+ /* Then copy the requested fields. */
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+
+ /* Skip virtual columns */
+ if (templ->is_virtual
+ && !(dict_index_has_virtual(prebuilt->index)
+ && prebuilt->read_just_key)) {
+ continue;
+ }
+
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
+ }
+ } else {
+ memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
+ }
+
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/********************************************************************//**
+Initialise the prefetch cache. */
+UNIV_INLINE
+void
+row_sel_prefetch_cache_init(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ulint i;
+ ulint sz;
+ byte* ptr;
+
+	/* Reserve space for the two magic numbers (4 bytes before and
+	4 bytes after each cached row). */
+ sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
+ ptr = static_cast<byte*>(ut_malloc_nokey(sz));
+
+ for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
+
+ /* A user has reported memory corruption in these
+ buffers in Linux. Put magic numbers there to help
+ to track a possible bug. */
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+
+ prebuilt->fetch_cache[i] = ptr;
+ ptr += prebuilt->mysql_row_len;
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+ }
+}
+
+/********************************************************************//**
+Get the last fetch cache buffer from the queue.
+@return pointer to buffer. */
+UNIV_INLINE
+byte*
+row_sel_fetch_last_buf(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ut_ad(!prebuilt->templ_contains_blob);
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+ ut_ad(prebuilt->n_fetch_cached == 0);
+
+ row_sel_prefetch_cache_init(prebuilt);
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+ MEM_UNDEFINED(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt->mysql_row_len);
+
+ return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_enqueue_cache_row_for_mysql(
+/*================================*/
+ byte* mysql_rec, /*!< in/out: MySQL record */
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+	/* For the non-ICP code path the row should already exist in the
+	next fetch cache slot. */
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ memcpy(row_sel_fetch_last_buf(prebuilt), mysql_rec,
+ prebuilt->mysql_row_len);
+ }
+
+ ++prebuilt->n_fetch_cached;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). We assume that the search
+mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
+btr search latch has been locked in S-mode if AHI is enabled.
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ const rec_t** out_rec,/*!< out: record if found */
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
+ rec_offs** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ const rec_t* rec;
+
+ ut_ad(index->is_primary());
+ ut_ad(!index->table->is_temporary());
+ ut_ad(!prebuilt->templ_contains_blob);
+ ut_ad(trx->read_view.is_open());
+ pcur->old_rec = nullptr;
+
+ if (btr_pcur_open_with_no_init(search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur, mtr)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
+ return SEL_RETRY;
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+ return SEL_EXHAUSTED;
+ }
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ return SEL_EXHAUSTED;
+ }
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+
+ if (row_sel_clust_sees(rec, *index, *offsets, trx->read_view)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ return SEL_EXHAUSTED;
+ }
+
+ *out_rec = rec;
+
+ return SEL_FOUND;
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/*********************************************************************//**
+Check a pushed-down index condition.
+@return CHECK_ABORTED_BY_USER, CHECK_NEG, CHECK_POS, or CHECK_OUT_OF_RANGE */
+static
+check_result_t
+row_search_idx_cond_check(
+/*======================*/
+ byte* mysql_rec, /*!< out: record
+ in MySQL format (invalid unless
+ prebuilt->idx_cond!=NULL and
+ we return ICP_MATCH) */
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ for the table handle */
+ const rec_t* rec, /*!< in: InnoDB record */
+ const rec_offs* offsets) /*!< in: rec_get_offsets() */
+{
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
+
+ if (!prebuilt->idx_cond) {
+ if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ return(CHECK_POS);
+ }
+ } else {
+ MONITOR_INC(MONITOR_ICP_ATTEMPTS);
+ }
+
+ /* Convert to MySQL format those fields that are needed for
+ evaluating the index condition. */
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_empty(prebuilt->blob_heap);
+ }
+
+ for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+ /* Skip virtual columns */
+ if (templ->is_virtual) {
+ continue;
+ }
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, prebuilt->index, offsets,
+ templ->icp_rec_field_no,
+ templ)) {
+ return(CHECK_NEG);
+ }
+ }
+
+ /* We assume that the index conditions on
+ case-insensitive columns are case-insensitive. The
+ case of such columns may be wrong in a secondary
+ index, if the case of the column has been updated in
+ the past, or a record has been deleted and a record
+ inserted in a different case. */
+ check_result_t result = prebuilt->idx_cond
+ ? handler_index_cond_check(prebuilt->idx_cond)
+ : CHECK_POS;
+
+ switch (result) {
+ case CHECK_POS:
+ if (handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ ut_ad(!prebuilt->index->is_primary());
+ if (prebuilt->clust_index_was_generated) {
+ ulint len;
+ dict_index_t* index = prebuilt->index;
+ const byte* data = rec_get_nth_field(
+ rec, offsets, index->n_fields - 1,
+ &len);
+ ut_ad(dict_index_get_nth_col(index,
+ index->n_fields - 1)
+ ->prtype == (DATA_ROW_ID | DATA_NOT_NULL));
+ ut_ad(len == DATA_ROW_ID_LEN);
+ memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN);
+ }
+ result = handler_rowid_filter_check(prebuilt->pk_filter);
+ switch (result) {
+ case CHECK_NEG:
+ MONITOR_INC(MONITOR_ICP_NO_MATCH);
+ return(result);
+ case CHECK_OUT_OF_RANGE:
+ MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+ return(result);
+ case CHECK_POS:
+ break;
+ default:
+ return(result);
+ }
+ }
+ /* Convert the remaining fields to MySQL format.
+ If this is a secondary index record, we must defer
+ this until we have fetched the clustered index record. */
+ if (!prebuilt->need_to_access_clustered
+ || dict_index_is_clust(prebuilt->index)) {
+ if (!row_sel_store_mysql_rec(
+ mysql_rec, prebuilt, rec, NULL, false,
+ prebuilt->index, offsets)) {
+ ut_ad(dict_index_is_clust(prebuilt->index));
+ return(CHECK_NEG);
+ }
+ }
+ MONITOR_INC(MONITOR_ICP_MATCH);
+ return(result);
+ case CHECK_NEG:
+ MONITOR_INC(MONITOR_ICP_NO_MATCH);
+ return(result);
+ case CHECK_OUT_OF_RANGE:
+ MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+ return(result);
+ case CHECK_ERROR:
+ case CHECK_ABORTED_BY_USER:
+ return(result);
+ }
+
+ ut_error;
+ return(result);
+}
+
+/** Extract virtual column data from a virtual index record and fill a dtuple
+@param[in] rec the virtual (secondary) index record
+@param[in] index the virtual index
+@param[in,out] vrow the dtuple where data extract to
+@param[in] heap memory heap to allocate memory
+*/
+static
+void
+row_sel_fill_vrow(
+ const rec_t* rec,
+ dict_index_t* index,
+ dtuple_t** vrow,
+ mem_heap_t* heap)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!(*vrow));
+ ut_ad(heap);
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!index->is_instant());
+ ut_ad(page_rec_is_leaf(rec));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ *vrow = dtuple_create_with_vcol(
+ heap, 0, dict_table_get_n_v_cols(index->table));
+
+ /* Initialize all virtual row's mtype to DATA_MISSING */
+ dtuple_init_v_fld(*vrow);
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+
+ if (col->is_virtual()) {
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+
+ dfield_t* dfield = dtuple_get_nth_v_field(
+ *vrow, vcol->v_pos);
+ dfield_set_data(dfield, data, len);
+ dict_col_copy_type(col, dfield_get_type(dfield));
+ }
+ }
+}
+
+/** Return the record field length in characters.
+@param[in] col table column of the field
+@param[in] field_no field number
+@param[in] rec physical record
+@param[in] offsets field offsets in the physical record
+@return field length in characters. */
+static
+size_t
+rec_field_len_in_chars(
+ const dict_col_t* col,
+ const ulint field_no,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const ulint cset = dtype_get_charset_coll(col->prtype);
+ const CHARSET_INFO* cs = all_charsets[cset];
+ ulint rec_field_len;
+ const char* rec_field = reinterpret_cast<const char *>(
+ rec_get_nth_field(
+ rec, offsets, field_no, &rec_field_len));
+
+ if (UNIV_UNLIKELY(!cs)) {
+ ib::warn() << "Missing collation " << cset;
+ return SIZE_T_MAX;
+ }
+
+ return cs->numchars(rec_field, rec_field + rec_field_len);
+}
+
+/** Avoid the clustered index lookup if all the following conditions
+are true:
+1) all columns are in the secondary index
+2) all values for columns that are indexed only by a prefix are shorter
+than the prefix size. This optimization can avoid many IOs for certain schemas.
+@return true, to avoid clustered index lookup. */
+static
+bool row_search_with_covering_prefix(
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const dict_index_t* index = prebuilt->index;
+ ut_ad(!dict_index_is_clust(index));
+
+ /* In ha_innobase::build_template() we choose to access the
+	whole row when using exclusive row locks, or in the case of an FTS
+	query, where we need to read from the clustered index */
+ if (prebuilt->select_lock_type == LOCK_X || prebuilt->in_fts_query
+ || !index->is_btree()) {
+ return false;
+ }
+
+	/** The optimization is only applicable if the number of fields in the
+	template does not exceed the number of fields in the secondary index. */
+ if (prebuilt->n_template > index->n_fields) {
+ return false;
+ }
+
+ /* We can avoid a clustered index lookup if
+ all of the following hold:
+ (1) all columns are in the secondary index
+ (2) all values for columns that are prefix-only
+ indexes are shorter than the prefix size
+ This optimization can avoid many IOs for certain schemas. */
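+	/* For example (illustration only): with a secondary index defined
+	on a 10-character prefix of a VARCHAR(100) column, a stored value
+	of 7 characters fits entirely within the indexed prefix, so the
+	full value can be returned from the secondary index record without
+	visiting the clustered index. */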
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ ulint j = templ->rec_prefix_field_no;
+ ut_ad(!templ->mbminlen == !templ->mbmaxlen);
+
+ /** Condition (1) : is the field in the index. */
+ if (j == ULINT_UNDEFINED) {
+ return false;
+ }
+
+ /** Condition (2): If this is a prefix index then
+ row's value size shorter than prefix length. */
+
+ if (!templ->rec_field_is_prefix
+ || rec_offs_nth_sql_null(offsets, j)) {
+ continue;
+ }
+
+ const dict_field_t* field = dict_index_get_nth_field(index, j);
+
+ if (!field->prefix_len) {
+ continue;
+ }
+
+ const ulint rec_size = rec_offs_nth_size(offsets, j);
+
+ if (rec_size >= field->prefix_len) {
+			/* The stored value occupies at least prefix_len
+			bytes, so the full column value may have been
+			truncated to the index prefix; we cannot avoid
+			the clustered index lookup. */
+ return false;
+ }
+
+ if (templ->mbminlen != templ->mbmaxlen
+ && rec_field_len_in_chars(field->col, j, rec, offsets)
+ >= field->prefix_len / templ->mbmaxlen) {
+			/* The number of characters in the stored value
+			reaches the index prefix character length, so the
+			value may have been truncated. */
+ return false;
+ }
+ }
+
+	/* The prefix index optimization conditions are satisfied: for all
+	columns above, use rec_prefix_field_no instead of rec_field_no, and
+	skip the clustered index lookup below. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ templ->rec_field_no = templ->rec_prefix_field_no;
+ ut_a(templ->rec_field_no != ULINT_UNDEFINED);
+ }
+
+ return true;
+}
+
+/** Searches for rows in the database using a cursor.
+The function is mainly used for tables that are shared across connections,
+and so it employs techniques that can help reconstruct the rows that the
+transaction is supposed to see.
+It also has optimizations such as pre-caching the rows, using AHI, etc.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info to search_tuple,
+ index; if search tuple contains 0 field then
+ we position the cursor at start or the end of
+ index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+			Note: if this is != 0, then prebuilt must have a
+			pcur with a stored position! When opening a
+			cursor, 'direction' should be 0.
+@return DB_SUCCESS or error code */
+dberr_t
+row_search_mvcc(
+ byte* buf,
+ page_cur_mode_t mode,
+ row_prebuilt_t* prebuilt,
+ ulint match_mode,
+ ulint direction)
+{
+ DBUG_ENTER("row_search_mvcc");
+ DBUG_ASSERT(prebuilt->index->table == prebuilt->table);
+
+ dict_index_t* index = prebuilt->index;
+ ibool comp = dict_table_is_comp(prebuilt->table);
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ const rec_t* UNINIT_VAR(rec);
+ dtuple_t* vrow = NULL;
+ const rec_t* result_rec = NULL;
+ const rec_t* clust_rec;
+ Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql;
+ ibool unique_search = FALSE;
+ ulint mtr_extra_clust_savepoint = 0;
+ bool moves_up = false;
+ /* if the returned record was locked and we did a semi-consistent
+ read (fetch the newest committed version), then this is set to
+ TRUE */
+ ulint next_offs;
+ bool same_user_rec;
+ ibool table_lock_waited = FALSE;
+ byte* next_buf = 0;
+ bool spatial_search = false;
+
+ ut_ad(index && pcur && search_tuple);
+ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
+
+ /* We do not support FTS queries through the HANDLER interface,
+ because FTS is implemented as an inverted index with auxiliary
+ tables, so nothing that applies to a traditional index query
+ applies to it. */
+ if (prebuilt->index->type & DICT_FTS) {
+ DBUG_RETURN(DB_END_OF_INDEX);
+ }
+
+ if (!prebuilt->table->space) {
+ DBUG_RETURN(DB_TABLESPACE_DELETED);
+ } else if (!prebuilt->table->is_readable()) {
+ if (fil_space_crypt_t* crypt_data =
+ prebuilt->table->space->crypt_data) {
+ if (crypt_data->should_encrypt()) {
+ DBUG_RETURN(DB_DECRYPTION_FAILED);
+ }
+ }
+ DBUG_RETURN(DB_CORRUPTION);
+ } else if (!prebuilt->index_usable) {
+ DBUG_RETURN(DB_MISSING_HISTORY);
+ } else if (prebuilt->index->is_corrupted()) {
+ DBUG_RETURN(DB_CORRUPTION);
+ }
+
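+ /* Attach the index to the persistent cursor before the search
+ is opened or its position is restored. */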
+ pcur->btr_cur.page_cur.index = index;
+
+ /* We need to get the virtual column values stored in the secondary
+ index key if this is a covering index scan or a virtual key read
+ is requested. */
+ bool need_vrow = prebuilt->read_just_key
+ && prebuilt->index->has_virtual();
+
+ /* Reset the new record lock info if the READ UNCOMMITTED or
+ READ COMMITTED isolation level is used, so that we are able to
+ remove the record locks set here on an individual row. */
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+ if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+ trx->op_info = "";
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+early_not_found:
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+ trx->op_info = "";
+ DBUG_RETURN(DB_RECORD_NOT_FOUND);
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+ /* Prevent wraparound of the counter */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
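+ /* Reuse the search mode with which the persistent cursor was
+ originally positioned. */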
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a
+ non-delete-marked matching record.
+
+ Note that in a unique secondary index there may be different
+ delete-marked versions of a record where only the primary key
+ values differ: thus in a secondary index we must use next-key
+ locks when locking delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && dict_index_is_unique(index)
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (dict_index_is_clust(index)
+ || !dtuple_contains_null(search_tuple))) {
+
+ /* Note above that a UNIQUE secondary index can contain many
+ rows with the same key value if one of the columns is SQL NULL.
+ A clustered index under MySQL can never contain NULL columns,
+ because we demand that all the columns in the primary key
+ are non-NULL. */
+
+ unique_search = TRUE;
+
+ /* Even if the condition is unique, MySQL seems to try to
+ retrieve a second row as well if the primary key contains more
+ than one column. Return immediately if this is not a HANDLER
+ command. */
+
+ if (UNIV_UNLIKELY(direction != 0
+ && !prebuilt->used_in_HANDLER)) {
+ goto early_not_found;
+ }
+ }
+
+ /* We do not support sequential scans on an R-tree index,
+ because such a scan would be meaningless. */
+ if (dict_index_is_spatial(index) && !RTREE_SEARCH_MODE(mode)) {
+ trx->op_info = "";
+ DBUG_RETURN(DB_END_OF_INDEX);
+ }
+
+ /* if the query is a plain locking SELECT, and the isolation level
+ is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
+ bool did_semi_consistent_read = false;
+ mtr_t mtr;
+ mtr.start();
+
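+ /* offsets_[] is a stack-based scratch buffer for rec_get_offsets();
+ heap is allocated lazily when larger allocations are needed. */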
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next, test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search where the row may be
+ long and contain externally stored fields. */
+
+ if (UNIV_UNLIKELY(direction == 0)
+ && unique_search
+ && btr_search_enabled
+ && dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < srv_page_size / 8)) {
+
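+ /* A unique exact-match search can be performed as PAGE_CUR_GE
+ on the complete key. */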
+ mode = PAGE_CUR_GE;
+
+ if (prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view.is_open()) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index. */
+
+ dberr_t err = DB_SUCCESS;
+ switch (row_sel_try_search_shortcut_for_mysql(
+ &rec, prebuilt, &offsets, &heap,
+ &mtr)) {
+ case SEL_FOUND:
+ /* At this point, rec is protected by
+ a page latch that was acquired by
+ row_sel_try_search_shortcut_for_mysql().
+ The latch will not be released until
+ mtr.commit(). */
+ ut_ad(!rec_get_deleted_flag(rec, comp));
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ switch (row_search_idx_cond_check(
+ buf, prebuilt,
+ rec, offsets)) {
+ case CHECK_ABORTED_BY_USER:
+ goto aborted;
+ case CHECK_NEG:
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto shortcut_done;
+ case CHECK_POS:
+ goto shortcut_done;
+ }
+
+ ut_ad("incorrect code" == 0);
+aborted:
+ err = DB_INTERRUPTED;
+ goto shortcut_done;
+ }
+
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt,
+ rec, NULL, false, index,
+ offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such
+ records do not exist. Such
+ records may only be accessed
+ at the READ UNCOMMITTED
+ isolation level or when
+ rolling back a recovered
+ transaction. Rollback happens
+ at a lower level, not here. */
+
+ /* Proceed as in case SEL_RETRY. */
+ break;
+ }
+
+ goto shortcut_done;
+
+ case SEL_EXHAUSTED:
+ err = DB_RECORD_NOT_FOUND;
+ shortcut_done:
+ mtr.commit();
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ trx->op_info = "";
+ ut_ad(!did_semi_consistent_read);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ DBUG_RETURN(err);
+
+ case SEL_RETRY:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ mtr.commit();
+ mtr.start();
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ spatial_search = dict_index_is_spatial(index)
+ && mode >= PAGE_CUR_CONTAIN;
+
+#ifdef UNIV_DEBUG
+ /* The state of a running trx can only be changed by the
+ thread that is currently serving the transaction. Because we
+ are that thread, we can read trx->state without holding any
+ mutex. */
+ switch (trx->state) {
+ case TRX_STATE_ACTIVE:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->table->no_rollback());
+ break;
+ default:
+ ut_ad("invalid trx->state" == 0);
+ }
+#endif
+
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->select_lock_type != LOCK_NONE
+ || trx->read_view.is_open()
+ || prebuilt->table->no_rollback()
+ || srv_read_only_mode);
+
+ /* Do not lock gaps at READ UNCOMMITTED or READ COMMITTED
+ isolation level */
+ const bool set_also_gap_locks =
+ prebuilt->select_lock_type != LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_COMMITTED
+#ifdef WITH_WSREP
+ && !wsrep_thd_skip_locking(trx->mysql_thd)
+#endif /* WITH_WSREP */
+ ;
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G
+ || mode >= PAGE_CUR_CONTAIN) {
+ moves_up = true;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = true;
+ }
+
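+ /* The dummy select query graph (built in row_prebuild_sel_graph())
+ provides the query thread that the locking routines below require. */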
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ dberr_t err = DB_SUCCESS;
+
+ /* Do some start-of-statement preparations */
+
+ if (prebuilt->table->no_rollback()) {
+ /* NO_ROLLBACK tables do not support MVCC or locking. */
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->sql_stat_start = FALSE;
+ } else if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+ ut_a(prebuilt->select_lock_type != LOCK_NONE
+ || srv_read_only_mode || trx->read_view.is_open());
+ } else {
+ prebuilt->sql_stat_start = FALSE;
+ trx_start_if_not_started(trx, false);
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+ trx->read_view.open(trx);
+ } else {
+wait_table_again:
+ err = lock_table(prebuilt->table, nullptr,
+ prebuilt->select_lock_type == LOCK_S
+ ? LOCK_IS : LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ table_lock_waited = TRUE;
+ goto lock_table_wait;
+ }
+ }
+ }
+
+ /* Open or restore index cursor position */
+
+ if (UNIV_LIKELY(direction != 0)) {
+ if (spatial_search) {
+ /* R-tree access does not need to position and
+ reposition the cursor */
+ goto next_rec;
+ }
+
+ bool need_to_process = sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr);
+
+ if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) {
+ mtr.commit();
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return DB_CORRUPTION;
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ /* We did a semi-consistent read,
+ but the record was removed in
+ the meantime. */
+ prebuilt->row_read_type
+ = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ } else if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+ /* The cursor was positioned on the record
+ that we returned previously. If we need
+ to repeat a semi-consistent read as a
+ pessimistic locking read, the record
+ cannot be skipped. */
+
+ goto next_rec_after_check;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+ pcur->btr_cur.thr = thr;
+ pcur->old_rec = nullptr;
+
+ if (index->is_spatial()) {
+ if (!prebuilt->rtr_info) {
+ prebuilt->rtr_info = rtr_create_rtr_info(
+ set_also_gap_locks, true,
+ btr_pcur_get_btr_cur(pcur), index);
+ prebuilt->rtr_info->search_tuple = search_tuple;
+ prebuilt->rtr_info->search_mode = mode;
+ rtr_info_update_btr(btr_pcur_get_btr_cur(pcur),
+ prebuilt->rtr_info);
+ } else {
+ rtr_info_reinit_in_cursor(
+ btr_pcur_get_btr_cur(pcur),
+ index, set_also_gap_locks);
+ prebuilt->rtr_info->search_tuple = search_tuple;
+ prebuilt->rtr_info->search_mode = mode;
+ }
+
+ err = rtr_search_leaf(pcur, search_tuple, mode, &mtr);
+ } else {
+ err = btr_pcur_open_with_no_init(search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, &mtr);
+ }
+
+ if (err != DB_SUCCESS) {
+page_corrupted:
+ rec = NULL;
+ goto page_read_error;
+ }
+
+ pcur->trx_if_known = trx;
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(page_rec_is_leaf(rec));
+
+ if (!moves_up
+ && set_also_gap_locks
+ && !page_rec_is_supremum(rec)
+ && !dict_index_is_spatial(index)) {
+
+ /* Try to place a gap lock on the next index record
+ to prevent phantoms in ORDER BY ... DESC queries */
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!next_rec)) {
+ err = DB_CORRUPTION;
+ goto page_corrupted;
+ }
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(pcur,
+ next_rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
+ err = pcur->open_leaf(mode == PAGE_CUR_G, index,
+ BTR_SEARCH_LEAF, &mtr);
+
+ if (err != DB_SUCCESS) {
+ if (err == DB_DECRYPTION_FAILED) {
+ btr_decryption_failed(*index);
+ }
+ rec = NULL;
+ goto page_read_error;
+ }
+ }
+
+ /* Check if the table is supposed to be empty for our read view.
+
+ If we read bulk_trx_id as an older transaction ID, it is not
+ incorrect to check here whether that transaction should be
+ visible to us. If bulk_trx_id is not visible to us, the table
+ must have been empty at an earlier point of time, also in our
+ read view.
+
+ An INSERT would only update bulk_trx_id in
+ row_ins_clust_index_entry_low() if the table really was empty
+ (everything had been purged), when holding a leaf page latch
+ in the clustered index (actually, the root page is the only
+ leaf page in that case).
+
+ We are already holding a leaf page latch here, either
+ in a secondary index or in a clustered index.
+
+ If we are holding a clustered index page latch, there clearly
+ is no potential for race condition with a concurrent INSERT:
+ such INSERT would be blocked by us.
+
+ If we are holding a secondary index page latch, then we are
+ not directly blocking a concurrent INSERT that might update
+ bulk_trx_id to something that does not exist in our read view.
+ But, in that case, the entire table (all indexes) must have
+ been empty. So, even if our read below missed the update of
+ index->table->bulk_trx_id, we can safely proceed to reading
+ the empty secondary index page. Our latch will prevent the
+ INSERT from proceeding to that page. It will first modify
+ the clustered index. Also, we may only look up something in
+ the clustered index if the secondary index page is not empty
+ to begin with. So, only if the table is corrupted
+ (the clustered index is empty but the secondary index is not)
+ we could return corrupted results. */
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || !trx->read_view.is_open()) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* If InnoDB intends to do any locking on the records,
+ it must allow the transaction to read all the rows,
+ regardless of the bulk_trx_id visibility check. */
+ if (prebuilt->select_lock_type == LOCK_NONE
+ && !trx->read_view.changes_visible(bulk_trx_id)) {
+ trx->op_info = "";
+ err = DB_END_OF_INDEX;
+ goto normal_return;
+ }
+ }
+
+rec_loop:
+ DEBUG_SYNC_C("row_search_rec_loop");
+ if (trx_is_interrupted(trx)) {
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+ err = DB_INTERRUPTED;
+ goto normal_return;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ ut_ad(!!page_rec_is_comp(rec) == comp);
+ ut_ad(page_rec_is_leaf(rec));
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+
+ /* Try to place a lock on the index record */
+
+ /* If the transaction isolation level is
+ READ UNCOMMITTED or READ COMMITTED,
+ we do not lock gaps. Supremum record is really
+ a gap and therefore we do not set locks there. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ if (comp) {
+ if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) {
+ /* Skip the metadata pseudo-record. */
+ ut_ad(index->is_instant());
+ goto next_rec;
+ }
+
+ next_offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ } else {
+ if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) {
+ /* Skip the metadata pseudo-record. */
+ ut_ad(index->is_instant());
+ goto next_rec;
+ }
+
+ next_offs = rec_get_next_offs(rec, FALSE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ }
+
+ if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) {
+
+wrong_offs:
+ if (srv_force_recovery == 0 || moves_up == false) {
+ ib::error() << "Rec address "
+ << static_cast<const void*>(rec)
+ << ", buf block fix count "
+ << btr_pcur_get_block(pcur)->page
+ .buf_fix_count();
+
+ ib::error() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". Run CHECK TABLE. You may need to"
+ " restore from a backup, or dump + drop +"
+ " reimport the table.";
+ ut_ad(0);
+ err = DB_CORRUPTION;
+
+ goto page_read_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ ib::info() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". We try to skip the rest of the page.";
+
+ page_cur_set_after_last(btr_pcur_get_block(pcur),
+ btr_pcur_get_page_cur(pcur));
+ pcur->old_rec = nullptr;
+ goto next_rec;
+ }
+ }
+ /*-------------------------------------------------------------*/
+
+ /* Calculate the 'offsets' associated with 'rec' */
+
+ ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur)));
+ ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+
+ ib::error() << "Index corruption: rec offs "
+ << page_offset(rec) << " next offs "
+ << next_offs
+ << btr_pcur_get_block(pcur)->page.id()
+ << ", index " << index->name
+ << " of table " << index->table->name
+ << ". We try to skip the record.";
+
+ goto next_rec;
+ }
+ }
+
+ /* Note that we cannot trust the up_match value in the cursor at this
+ place because we can arrive here after moving the cursor! Thus
+ we have to recompare rec and search_tuple to determine if they
+ match enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+ /* Test if the index record matches completely to search_tuple
+ in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (cmp_dtuple_rec(search_tuple, rec, index, offsets)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+ err = sel_set_rec_lock(
+ pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec,
+ index, offsets)) {
+
+ if (set_also_gap_locks
+ && !dict_index_is_spatial(index)) {
+ err = sel_set_rec_lock(
+ pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ unsigned lock_type;
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* At READ COMMITTED or READ UNCOMMITTED
+ isolation levels, do not lock committed
+ delete-marked records. */
+ if (!rec_get_deleted_flag(rec, comp)) {
+ goto no_gap_lock;
+ }
+
+ /* At most one transaction can be active
+ for a temporary table. */
+ if (clust_index->table->is_temporary()) {
+ goto no_gap_lock;
+ }
+
+ if (index == clust_index) {
+ trx_id_t trx_id = row_get_rec_trx_id(
+ rec, index, offsets);
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(trx_id);
+ if (!trx_sys.is_registered(trx, trx_id)) {
+ /* The clustered index record
+ was delete-marked in a committed
+ transaction. Ignore the record. */
+ goto locks_ok_del_marked;
+ }
+ } else if (trx_t* t = row_vers_impl_x_locked(
+ trx, rec, index, offsets)) {
+ /* The record belongs to an active
+ transaction. We must acquire a lock. */
+ t->release_reference();
+ } else {
+ /* The secondary index record does not
+ point to a delete-marked clustered index
+ record that belongs to an active transaction.
+ Ignore the secondary index record, because
+ it is not locked. */
+ goto next_rec;
+ }
+
+ goto no_gap_lock;
+ }
+
+#ifdef WITH_WSREP
+ if (UNIV_UNLIKELY(!set_also_gap_locks)) {
+ ut_ad(wsrep_thd_skip_locking(trx->mysql_thd));
+ goto no_gap_lock;
+ }
+#else /* WITH_WSREP */
+ ut_ad(set_also_gap_locks);
+#endif /* WITH_WSREP */
+
+ /* Set a next-key lock both for delete-marked and non-delete-marked
+ records in a unique search, because a non-delete-marked record can
+ be delete-marked while our transaction is suspended. */
+ if (index->is_spatial()) {
+ goto no_gap_lock;
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && !cmp_dtuple_rec(search_tuple, rec, index, offsets)) {
+no_gap_lock:
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(pcur,
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr, &mtr);
+
+ switch (err) {
+ const rec_t* old_vers;
+ case DB_SUCCESS_LOCKED_REC:
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ }
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ case DB_LOCK_WAIT:
+ /* Lock wait for R-tree should already
+ be handled in sel_set_rtr_rec_lock() */
+ ut_ad(!dict_index_is_spatial(index));
+ /* Never unlock rows that were part of a conflict. */
+ prebuilt->new_rec_locks = 0;
+
+ if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_TRY_SEMI_CONSISTENT)
+ || unique_search
+ || index != clust_index) {
+ if (!prebuilt->skip_locked) {
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers,
+ need_vrow ? &vrow : NULL, &mtr);
+ }
+
+ /* Check whether it was a deadlock or not, if not
+ a deadlock and the transaction had to wait then
+ release the lock it is waiting on. */
+
+ err = lock_trx_handle_wait(trx);
+
+ switch (err) {
+ case DB_SUCCESS:
+ ut_ad(
+ !trx->lock.was_chosen_as_deadlock_victim);
+ /* The lock was granted while we were
+ searching for the last committed version.
+ Do a normal locking read. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ goto locks_ok;
+ case DB_DEADLOCK:
+ goto lock_wait_or_error;
+ case DB_LOCK_WAIT:
+ ut_ad(!dict_index_is_spatial(index));
+ err = DB_SUCCESS;
+ if (prebuilt->skip_locked) {
+ goto next_rec;
+ }
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
+ default:
+ ut_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row was not yet committed */
+
+ goto next_rec;
+ }
+
+ did_semi_consistent_read = true;
+ rec = old_vers;
+ break;
+ case DB_RECORD_NOT_FOUND:
+ if (dict_index_is_spatial(index)) {
+ goto next_rec;
+ } else {
+ goto lock_wait_or_error;
+ }
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
+ default:
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || prebuilt->table->is_temporary()
+ || prebuilt->table->no_rollback()) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ err = row_sel_clust_sees(rec, *index, offsets,
+ trx->read_view);
+
+ switch (err) {
+ default:
+ goto lock_wait_or_error;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ ut_ad(srv_force_recovery
+ < SRV_FORCE_NO_UNDO_LOG_SCAN);
+ rec_t* old_vers;
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ prebuilt, clust_index,
+ rec, &offsets, &heap, &old_vers,
+ need_vrow ? &vrow : nullptr, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(
+ page_align(rec));
+ ut_ad(trx_id);
+ if (trx->read_view.sees(trx_id)) {
+ goto locks_ok;
+ }
+ /* We should look at the clustered index.
+ However, as this is a non-locking read,
+ we can skip the clustered index lookup if
+ the condition does not match the secondary
+ index entry. */
+ switch (row_search_idx_cond_check(
+ buf, prebuilt, rec, offsets)) {
+ case CHECK_NEG:
+ goto next_rec;
+ case CHECK_ABORTED_BY_USER:
+ err = DB_INTERRUPTED;
+ goto idx_cond_failed;
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case CHECK_POS:
+ goto requires_clust_rec;
+ }
+
+ ut_error;
+ }
+ }
+ }
+
+locks_ok:
+ /* NOTE that at this point rec can be an old version of a clustered
+ index record built for a consistent read. We cannot assume after this
+ point that rec is on a buffer pool page. Functions like
+ page_rec_is_comp() cannot be used! */
+
+ if (rec_get_deleted_flag(rec, comp)) {
+locks_ok_del_marked:
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(index != clust_index
+ || row_get_rec_trx_id(rec, index, offsets));
+
+ /* The record is delete-marked: we can skip it */
+
+ /* This is an optimization to skip setting the next key lock
+ on the record that follows this delete-marked record. This
+ optimization works because of the unique search criteria
+ which precludes the presence of a range lock between this
+ delete marked record and the record following it.
+
+ For now this is applicable only to clustered indexes while
+ doing a unique search except for HANDLER queries because
+ HANDLER allows NEXT and PREV even in unique search on
+ clustered index. There is scope for further optimization
+ applicable to unique secondary indexes. Current behaviour is
+ to widen the scope of a lock on an already delete marked record
+ if the same record is deleted twice by the same transaction. */
+ if (index == clust_index && unique_search
+ && !prebuilt->used_in_HANDLER) {
+
+ err = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ goto next_rec;
+ }
+
+ /* Check if the record matches the index condition. */
+ switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
+ case CHECK_NEG:
+ if (did_semi_consistent_read) {
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+ goto next_rec;
+ case CHECK_ABORTED_BY_USER:
+ err = DB_INTERRUPTED;
+ goto idx_cond_failed;
+ case CHECK_OUT_OF_RANGE:
+ case CHECK_ERROR:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case CHECK_POS:
+ break;
+ }
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ if (row_search_with_covering_prefix(prebuilt, rec, offsets)) {
+ goto use_covering_index;
+ }
+requires_clust_rec:
+ ut_ad(index != clust_index);
+ /* We use a 'goto' to the preceding label if a consistent
+ read of a secondary index record requires us to look up old
+ versions of the associated clustered index record. */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
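+ /* Remember the mini-transaction savepoint, so that latches acquired
+ for the clustered index lookup can be released before moving on to
+ the next secondary index record. */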
+ mtr_extra_clust_savepoint = mtr.get_savepoint();
+
+ ut_ad(!vrow);
+ /* The following call returns 'offsets' associated with
+ 'clust_rec'. Note that 'clust_rec' can be an old version
+ built for a consistent read. */
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap,
+ need_vrow ? &vrow : NULL,
+ &mtr);
+ if (err == DB_LOCK_WAIT && prebuilt->skip_locked) {
+ err = lock_trx_handle_wait(trx);
+ }
+ switch (err) {
+ case DB_SUCCESS:
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE
+ || dict_index_is_spatial(index));
+ goto next_rec;
+ }
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ ut_a(clust_rec != NULL);
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ /* Note that the clustered index record
+ was locked. */
+ prebuilt->new_rec_locks = 2;
+ }
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ case DB_LOCK_WAIT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
+ default:
+ vrow = NULL;
+ goto lock_wait_or_error;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, comp)) {
+
+ /* The record is delete marked: we can skip it */
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* No need to keep a lock on a delete-marked
+ record if we do not want to use next-key
+ locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ goto next_rec;
+ }
+
+ if (need_vrow && !vrow) {
+ if (!heap) {
+ heap = mem_heap_create(100);
+ }
+ row_sel_fill_vrow(rec, index, &vrow, heap);
+ }
+
+ result_rec = clust_rec;
+ ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ /* Convert the record to MySQL format. We were
+ unable to do this in row_search_idx_cond_check(),
+ because the condition is on the secondary index
+ and the requested column is in the clustered index.
+ We convert all fields, including those that
+ may have been used in ICP, because the
+ secondary index may contain a column prefix
+ rather than the full column. Also, as noted
+ in Bug #56680, the column in the secondary
+ index may be in the wrong case, and the
+ authoritative case is in result_rec, the
+ appropriate version of the clustered index record. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec, vrow,
+ true, clust_index, offsets)) {
+ goto next_rec;
+ }
+ }
+ } else {
+use_covering_index:
+ result_rec = rec;
+ }
+
+ /* We found a qualifying record 'result_rec'. At this point,
+ 'offsets' are associated with 'result_rec'. */
+
+ ut_ad(rec_offs_validate(result_rec,
+ result_rec != rec ? clust_index : index,
+ offsets));
+ ut_ad(!rec_get_deleted_flag(result_rec, comp));
+
+ /* Decide whether to prefetch extra rows.
+ At this point, the clustered index record is protected
+ by a page latch that was acquired when pcur was positioned.
+ The latch will not be released until mtr.commit(). */
+
+ if ((match_mode == ROW_SEL_EXACT
+ || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && !prebuilt->in_fts_query) {
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+ update, that is why we require ...lock_type == LOCK_NONE.
+ Since we keep space in prebuilt only for the BLOBs of
+ a single row, we cannot cache rows in the case there
+ are BLOBs in the fields to be fetched. In HANDLER we do
+ not cache rows because there the cursor is a scrollable
+ cursor. */
+
+ ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ /* We only convert from InnoDB row format to MySQL row
+ format when ICP is disabled. */
+
+ if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
+ /* We use next_buf to track the buffers that we
+ allocate, fill, and enqueue for our
+ pre-fetch optimisation.
+
+ If next_buf == 0 then we store the converted record
+ directly into the MySQL record buffer (buf). If it is
+ != 0 then we allocate a pre-fetch buffer and store the
+ converted record there.
+
+ If the conversion fails and the MySQL record buffer
+ was not written to then we reset next_buf so that
+ we can re-use the MySQL record buffer in the next
+ iteration. */
+
+ next_buf = next_buf
+ ? row_sel_fetch_last_buf(prebuilt) : buf;
+
+ if (!row_sel_store_mysql_rec(
+ next_buf, prebuilt, result_rec, vrow,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+
+ if (next_buf == buf) {
+ ut_a(prebuilt->n_fetch_cached == 0);
+ next_buf = 0;
+ }
+
+ /* Only fresh inserts may contain incomplete
+ externally stored columns. Pretend that such
+ records do not exist. Such records may only be
+ accessed at the READ UNCOMMITTED isolation
+ level or when rolling back a recovered
+ transaction. Rollback happens at a lower
+ level, not here. */
+ goto next_rec;
+ }
+
+ if (next_buf != buf) {
+ row_sel_enqueue_cache_row_for_mysql(
+ next_buf, prebuilt);
+ }
+ } else {
+ row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
+ }
+
+ if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
+ goto next_rec;
+ }
+ } else {
+ if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
+ /* The record was not yet converted to MySQL format. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec, vrow,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such records do
+ not exist. Such records may only be
+ accessed at the READ UNCOMMITTED
+ isolation level or when rolling back a
+ recovered transaction. Rollback
+ happens at a lower level, not here. */
+ goto next_rec;
+ }
+ }
+
+ if (!prebuilt->clust_index_was_generated) {
+ } else if (result_rec != rec || index->is_primary()) {
+ memcpy(prebuilt->row_id, result_rec, DATA_ROW_ID_LEN);
+ } else {
+ ulint len;
+ const byte* data = rec_get_nth_field(
+ result_rec, offsets, index->n_fields - 1,
+ &len);
+ ut_ad(dict_index_get_nth_col(index,
+ index->n_fields - 1)
+ ->prtype == (DATA_ROW_ID | DATA_NOT_NULL));
+ ut_ad(len == DATA_ROW_ID_LEN);
+ memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN);
+ }
+ }
+
+ /* From this point on, 'offsets' are invalid. */
+
+ /* We have an optimization to save CPU time: if this is a consistent
+ read on a unique condition on the clustered index, then we do not
+ store the pcur position, because any fetch next or prev will anyway
+ return 'end of file'. Exceptions are locking reads and the MySQL
+ HANDLER command where the user can move the cursor with PREV or NEXT
+ even after a unique search. */
+
+ err = DB_SUCCESS;
+
+idx_cond_failed:
+ if (!unique_search
+ || !dict_index_is_clust(index)
+ || direction != 0
+ || prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->used_in_HANDLER) {
+
+ /* Inside an update always store the cursor position */
+
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+ }
+
+ goto normal_return;
+
+next_rec:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+next_rec_after_check:
+ did_semi_consistent_read = false;
+ prebuilt->new_rec_locks = 0;
+ vrow = NULL;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ /* NOTE: For moves_up==FALSE, the mini-transaction will be
+ committed and restarted every time when switching b-tree
+ pages. For moves_up==TRUE in index condition pushdown, we can
+ scan an entire secondary index tree within a single
+ mini-transaction. As long as the prebuilt->idx_cond does not
+ match, we do not need to consult the clustered index or
+ return records to MySQL, and thus we can avoid repositioning
+ the cursor. What prevents us from buffer-fixing all leaf pages
+ within the mini-transaction is the btr_leaf_page_release()
+ call in btr_pcur_move_to_next_page(). Only the leaf page where
+ the cursor is positioned will remain buffer-fixed.
+ For R-tree spatial search, we also commit the mini-transaction
+ each time */
+
+ if (spatial_search) {
+ /* No need to store and restore the cursor position for R-tree */
+ mtr.rollback_to_savepoint(0);
+ } else if (mtr_extra_clust_savepoint) {
+ /* We must release any clustered index latches
+ if we are moving to the next non-clustered
+ index record, because we could break the latching
+ order if we would access a different clustered
+ index page right away without releasing the previous. */
+ mtr.rollback_to_savepoint(mtr_extra_clust_savepoint);
+ }
+
+ mtr_extra_clust_savepoint = 0;
+
+ if (moves_up) {
+ if (UNIV_UNLIKELY(spatial_search)) {
+ if (rtr_pcur_move_to_next(
+ search_tuple, mode, pcur, 0, &mtr)) {
+ goto rec_loop;
+ }
+ } else {
+ /* This is based on btr_pcur_move_to_next() */
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(pcur->latch_mode != BTR_NO_LATCHES);
+ pcur->old_rec = nullptr;
+ if (btr_pcur_is_after_last_on_page(pcur)) {
+ if (btr_pcur_is_after_last_in_tree(pcur)) {
+ goto not_moved;
+ }
+ err = btr_pcur_move_to_next_page(pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ goto lock_wait_or_error;
+ }
+ } else if (!btr_pcur_move_to_next_on_page(pcur)) {
+ goto corrupted;
+ }
+
+ goto rec_loop;
+ }
+ } else {
+ if (btr_pcur_move_to_prev(pcur, &mtr)) {
+ goto rec_loop;
+ }
+ if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) {
+corrupted:
+ err = DB_CORRUPTION;
+ goto normal_return;
+ }
+ }
+
+not_moved:
+ if (!spatial_search) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
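+ /* An exact or exact-prefix match search reports 'record not found';
+ a plain index scan has simply reached the end of the index. */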
+ err = match_mode ? DB_RECORD_NOT_FOUND : DB_END_OF_INDEX;
+ goto normal_return;
+
+lock_wait_or_error:
+ if (!dict_index_is_spatial(index)) {
+ btr_pcur_store_position(pcur, &mtr);
+ }
+page_read_error:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = false;
+
+lock_table_wait:
+ mtr.commit();
+ mtr_extra_clust_savepoint = 0;
+
+ trx->error_state = err;
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ /* It was a lock wait, and it ended */
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr.start();
+
+ /* A table lock was waited for; go and try to obtain the
+ table lock again */
+ if (table_lock_waited) {
+ table_lock_waited = FALSE;
+
+ goto wait_table_again;
+ }
+
+ if (!dict_index_is_spatial(index)) {
+ sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ }
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && !same_user_rec) {
+
+ /* Since we were not able to restore the cursor
+ on the same user record, we cannot use
+ row_unlock_for_mysql() to unlock any records, and
+ we must thus reset the new rec lock info. Since
+ in lock0lock.cc we have blocked the inheriting of gap
+ X-locks, we actually do not have any new record locks
+ set in this case.
+
+ Note that if we were able to restore on the 'same'
+ user record, it is still possible that we were actually
+ waiting on a delete-marked record, and meanwhile
+ it was removed by purge and inserted again by some
+ other user. But that is no problem, because in
+ rec_loop we will again try to set a lock, and
+ new_rec_lock_info in trx will be right at the end. */
+
+ prebuilt->new_rec_locks = 0;
+ }
+
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ goto func_exit;
+
+normal_return:
+ mtr.commit();
+
+ DEBUG_SYNC_C("row_search_for_mysql_before_return");
+
+ if (prebuilt->pk_filter || prebuilt->idx_cond) {
+ /* When ICP is active we don't write to the MySQL buffer
+ directly, only to buffers that are enqueued in the pre-fetch
+ queue. We need to dequeue the first buffer and copy the contents
+ to the record buffer that was passed in by MySQL. */
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+ err = DB_SUCCESS;
+ }
+
+ } else if (next_buf != 0) {
+
+ /* We may or may not have enqueued some buffers to the
+ pre-fetch queue, but we definitely wrote to the record
+ buffer passed to us by MySQL. */
+
+ DEBUG_SYNC_C("row_search_cached_row");
+ err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_DEBUG
+ if (dict_index_is_spatial(index) && err != DB_SUCCESS
+ && err != DB_END_OF_INDEX && err != DB_INTERRUPTED) {
+ rtr_node_path_t* path = pcur->btr_cur.rtr_info->path;
+
+ ut_ad(path->empty());
+ }
+#endif
+
+func_exit:
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Set or reset the "did semi-consistent read" flag on return.
+ The flag did_semi_consistent_read is set if and only if
+ the record being returned was fetched with a semi-consistent read. */
+ ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+ || !did_semi_consistent_read);
+
+ if (prebuilt->row_read_type != ROW_READ_WITH_LOCKS) {
+ if (did_semi_consistent_read) {
+ prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ }
+
+ DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
+
+ DBUG_RETURN(err);
+}
+
+/********************************************************************//**
+Count rows in an R-tree leaf level.
+@return DB_SUCCESS if successful */
+dberr_t
+row_count_rtree_recs(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handler; this contains the info
+ on search_tuple and index; if the search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+{
+ dict_index_t* index = prebuilt->index;
+ dberr_t ret = DB_SUCCESS;
+ mtr_t mtr;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dtuple_t* search_entry = prebuilt->search_tuple;
+ ulint entry_len;
+ ulint i;
+ byte* buf;
+
+ ut_a(dict_index_is_spatial(index));
+
+ *n_rows = 0;
+
+ heap = mem_heap_create(256);
+
+ /* Build a search tuple. */
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = ind_field->col;
+ dfield_t* dfield
+ = dtuple_get_nth_field(entry, i);
+
+ if (i == 0) {
+ double* mbr;
+ double tmp_mbr[SPDIMS * 2];
+
+ dfield->type.mtype = DATA_GEOMETRY;
+ dfield->type.prtype |= DATA_GIS_MBR;
+
+ /* Allocate memory for mbr field */
+ mbr = static_cast<double*>
+ (mem_heap_alloc(heap, DATA_MBR_LEN));
+
+ /* Set mbr field data. */
+ dfield_set_data(dfield, mbr, DATA_MBR_LEN);
+
+ for (uint j = 0; j < SPDIMS; j++) {
+ tmp_mbr[j * 2] = DBL_MAX;
+ tmp_mbr[j * 2 + 1] = -DBL_MAX;
+ }
+ dfield_write_mbr(dfield, tmp_mbr);
+ continue;
+ }
+
+ dfield->type.mtype = col->mtype;
+ dfield->type.prtype = col->prtype;
+
+ }
+
+ prebuilt->search_tuple = entry;
+
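+ /* The fetch buffer must be large enough to hold a complete row
+ in MySQL format. */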
+ ulint bufsize = std::max<ulint>(srv_page_size,
+ prebuilt->mysql_row_len);
+ buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
+
+ ulint direction = 0;
+
+loop:
+ ret = row_search_mvcc(buf, PAGE_CUR_WITHIN, prebuilt, 0, direction);
+ direction = ROW_SEL_NEXT;
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ case DB_LOCK_WAIT_TIMEOUT:
+ case DB_INTERRUPTED:
+ goto func_exit;
+ default:
+ /* fall through (this error is ignored by CHECK TABLE) */
+ case DB_END_OF_INDEX:
+ ret = DB_SUCCESS;
+func_exit:
+ prebuilt->search_tuple = search_entry;
+ ut_free(buf);
+ mem_heap_free(heap);
+
+ return(ret);
+ }
+
+ ++*n_rows;
+ goto loop;
+}
+
+/** Check if a version of a clustered index record and a secondary
+index record match.
+
+@param prebuilt index and transaction
+@param clust_rec a version of a clustered index record
+@param clust_index clustered index
+@param clust_offsets rec_get_offsets(clust_rec, clust_index)
+@param rec secondary index leaf page record
+@param index secondary index
+@param offsets rec_get_offsets(rec, index)
+@return an error code
+@retval DB_SUCCESS if rec matches clust_rec
+@retval DB_SUCCESS_LOCKED_REC if rec does not match clust_rec
+*/
+static dberr_t row_check_index_match(row_prebuilt_t *prebuilt,
+ const rec_t *clust_rec,
+ const dict_index_t *clust_index,
+ const rec_offs *clust_offsets,
+ const rec_t *rec,
+ const dict_index_t *index,
+ const rec_offs *offsets)
+{
+ ut_ad(index == prebuilt->index);
+
+ ib_vcol_row vc(index->has_virtual() ? mem_heap_create(256) : nullptr);
+
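+  /* Compare each user-defined column of the secondary index record
+  with the corresponding value in the clustered index record version. */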
+ const uint16_t n= index->n_user_defined_cols;
+
+ for (uint16_t i= 0; i < n; i++)
+ {
+ ulint pos= 0;
+ ulint len, sec_len;
+
+ const dict_field_t &ifield= index->fields[i];
+ const byte *sec_field= rec_get_nth_field(rec, offsets, i, &sec_len);
+ const byte *field;
+
+ if (ifield.col->is_virtual())
+ {
+ /* Virtual column values must be reconstructed from the base columns. */
+ row_ext_t *ext;
+ byte *record= vc.record(prebuilt->trx->mysql_thd, clust_index,
+ &prebuilt->m_mysql_table);
+ const dict_v_col_t *v_col= reinterpret_cast<const dict_v_col_t*>
+ (ifield.col);
+ dtuple_t *row= row_build(ROW_COPY_POINTERS,
+ clust_index, clust_rec, clust_offsets,
+ nullptr, nullptr, nullptr, &ext, vc.heap);
+ if (dfield_t *vfield=
+ innobase_get_computed_value(row, v_col, clust_index, &vc.heap,
+ nullptr, nullptr,
+ prebuilt->trx->mysql_thd,
+ prebuilt->m_mysql_table,
+ record, nullptr, nullptr))
+ {
+ len= vfield->len;
+ field= static_cast<byte*>(vfield->data);
+ }
+ else
+ {
+ innobase_report_computed_value_failed(row);
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+ }
+ else
+ {
+ pos= dict_col_get_clust_pos(ifield.col, clust_index);
+ field= rec_get_nth_cfield(clust_rec, clust_index, clust_offsets, pos,
+ &len);
+ if (len == UNIV_SQL_NULL)
+ {
+ if (sec_len == UNIV_SQL_NULL)
+ continue;
+ return DB_SUCCESS_LOCKED_REC;
+ }
+ if (sec_len == UNIV_SQL_NULL)
+ return DB_SUCCESS_LOCKED_REC;
+
+ if (rec_offs_nth_extern(clust_offsets, pos))
+ {
+ if (len == BTR_EXTERN_FIELD_REF_SIZE)
+ goto compare_blobs;
+ len-= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ if (ifield.prefix_len)
+ {
+ len=
+ dtype_get_at_most_n_mbchars(ifield.col->prtype, ifield.col->mbminlen,
+ ifield.col->mbmaxlen,
+ ifield.prefix_len, len,
+ reinterpret_cast<const char*>(field));
+ if (len < sec_len)
+ goto check_for_blob;
+ }
+ else
+ {
+check_for_blob:
+ if (rec_offs_nth_extern(clust_offsets, pos))
+ {
+compare_blobs:
+ if (!row_sel_sec_rec_is_for_blob(ifield.col->mtype,
+ ifield.col->prtype,
+ ifield.col->mbminlen,
+ ifield.col->mbmaxlen,
+ field, len, sec_field, sec_len,
+ ifield.prefix_len,
+ clust_index->table))
+ return DB_SUCCESS_LOCKED_REC;
+ continue;
+ }
+ }
+ }
+
+ if (cmp_data(ifield.col->mtype, ifield.col->prtype, false,
+ field, len, sec_field, sec_len))
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in ascending order,
+the unique constraint must not be violated by duplicate keys,
+and the number of index entries is counted according to the
+current read view.
+
+@param prebuilt index and transaction
+@param n_rows number of records counted
+
+@return error code
+@retval DB_SUCCESS if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ *n_rows= 0;
+ dict_index_t *const index= prebuilt->index;
+
+ if (!index->is_btree())
+ return DB_CORRUPTION;
+
+ mem_heap_t *heap= mem_heap_create(100);
+
+ dtuple_t *prev_entry= nullptr;
+ mtr_t mtr;
+ mtr.start();
+
+ dict_index_t *clust_index= dict_table_get_first_index(prebuilt->table);
+ prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index;
+ dberr_t err= prebuilt->pcur->open_leaf(true, index, BTR_SEARCH_LEAF, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+func_exit:
+ mtr.commit();
+ mem_heap_free(heap);
+ return err;
+ }
+
+ if (const trx_id_t bulk_trx_id= index->table->bulk_trx_id)
+ if (!prebuilt->trx->read_view.changes_visible(bulk_trx_id))
+ goto func_exit;
+
+ ReadView check_table_extended_view;
+ ReadView &view=
+ prebuilt->need_to_access_clustered &&
+ !prebuilt->table->is_temporary() &&
+ prebuilt->trx->isolation_level != TRX_ISO_READ_UNCOMMITTED
+ ? check_table_extended_view : prebuilt->trx->read_view;
+ if (&view == &check_table_extended_view)
+ check_table_extended_view.set_creator_trx_id(prebuilt->trx->id);
+
+page_loop:
+ if (&view == &check_table_extended_view)
+ /* In CHECK TABLE...EXTENDED, we make a copy of purge_sys.end_view
+ while holding a shared latch on the index leaf page.
+ Should a currently active purge batch desire to remove any further
+ records from this page, it would be blocked by our page latch.
+
+ We will consult check_table_extended_view to determine if a
+ clustered index record corresponding to a secondary index record
+ is visible to the current purge batch. Right after we have made our
+ copy, purge_sys.end_view is free to be changed again.
+
+ If we have an orphan secondary index record, we may attempt to
+ request a clustered index record version that cannot be retrieved
+ any more because the undo log records may have been freed
+ (according to the purge_sys.end_view). In such a case,
+ trx_undo_get_undo_rec() would cause
+ trx_undo_prev_version_build() to return DB_MISSING_HISTORY.
+ static_cast<ReadViewBase&>(check_table_extended_view)=
+ purge_sys_t::end_view_guard{}.view();
+
+rec_loop:
+ ut_ad(err == DB_SUCCESS);
+
+ if (!btr_pcur_move_to_next_on_page(prebuilt->pcur))
+ {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ const rec_t *rec= btr_pcur_get_rec(prebuilt->pcur);
+ rec_offs *offsets= offsets_;
+
+ if (page_rec_is_supremum(rec))
+ {
+ next_page:
+ if (btr_pcur_is_after_last_in_tree(prebuilt->pcur))
+ goto func_exit;
+ err= btr_pcur_move_to_next_page(prebuilt->pcur, &mtr);
+ if (err == DB_SUCCESS && trx_is_interrupted(prebuilt->trx))
+ err= DB_INTERRUPTED;
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ goto page_loop;
+ }
+
+ offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ const auto info_bits=
+ rec_get_info_bits(rec, prebuilt->table->not_redundant());
+ const bool rec_deleted= info_bits & REC_INFO_DELETED_FLAG;
+
+ if (UNIV_UNLIKELY(info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (*n_rows || !index->is_instant())
+ {
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE,
+ "InnoDB: invalid record encountered");
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ }
+ goto next_rec;
+ }
+
+ if (prebuilt->table->is_temporary())
+ {
+ count_or_not:
+ if (rec_deleted)
+ goto next_rec;
+ }
+ else if (index->is_clust())
+ {
+ if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED)
+ goto count_or_not;
+
+ trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets);
+
+ if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ invalid_trx_id:
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: DB_TRX_ID=" TRX_ID_FMT
+ " exceeds the system-wide maximum",
+ rec_trx_id);
+ prebuilt->autoinc_error= DB_CORRUPTION;
+ goto next_rec;
+ }
+
+ if (!prebuilt->trx->read_view.changes_visible(rec_trx_id))
+ {
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+ rec_t *old_vers;
+ /* The following call returns 'offsets' associated with 'old_vers' */
+ err= row_sel_build_prev_vers_for_mysql(prebuilt, index, rec, &offsets,
+ &heap, &old_vers, nullptr, &mtr);
+
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ if (old_vers)
+ {
+ rec= old_vers;
+ rec_trx_id= row_get_rec_trx_id(rec, index, offsets);
+
+ if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ goto invalid_trx_id;
+
+ if (!rec_get_deleted_flag(rec, prebuilt->table->not_redundant()))
+ goto count_row;
+ }
+ else
+ offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ goto next_rec;
+ }
+ else if (!rec_deleted && !rec_trx_id);
+ else if (!check_table_extended_view.changes_visible(rec_trx_id));
+ else if (prebuilt->autoinc_error == DB_SUCCESS)
+ {
+ const char *msg= rec_deleted
+ ? "Unpurged clustered index record"
+ : "Clustered index record with stale history";
+
+ ib::warn w;
+ w << msg << " in table " << index->table->name << ": "
+ << rec_offsets_print(rec, offsets);
+ prebuilt->autoinc_error= DB_MISSING_HISTORY;
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE, "InnoDB: %s", w.m_oss.str().c_str());
+ }
+
+ goto count_or_not;
+ }
+ else if (const trx_id_t page_trx_id= page_get_max_trx_id(page_align(rec)))
+ {
+ if (page_trx_id >= trx_sys.get_max_trx_id())
+ goto invalid_PAGE_MAX_TRX_ID;
+ if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
+ else if (&view == &check_table_extended_view || rec_deleted ||
+ !view.sees(page_trx_id))
+ {
+ bool got_extended_match= &view == &check_table_extended_view;
+ const auto savepoint= mtr.get_savepoint();
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, index, offsets);
+ err= btr_pcur_open_with_no_init(prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ const rec_t *clust_rec= btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec) ||
+ btr_pcur_get_low_match(prebuilt->clust_pcur) < clust_index->n_uniq)
+ {
+ if (!rec_deleted)
+ {
+ not_found:
+ /* MDEV-29823 FIXME: There is a race condition between
+ rollback, purge, and possibly other SQL connections that
+ are creating and releasing read views. At the time
+ row_undo_mod_del_mark_or_remove_sec_low() is executing
+ rollback on a secondary index record, purge_sys.view
+ may not allow it to delete the record, and it will be
+ delete-marked. Eventually purge_sys.view would advance,
+ but the delete-marked record could never be removed,
+ because no undo log record was ever added to
+ the purge queue by trx_purge_add_undo_to_history().
+
+ For now, we will not flag an error about orphan secondary index
+ records that are delete-marked; we will only warn about them. */
+
+ if (!rec_deleted || prebuilt->autoinc_error == DB_SUCCESS)
+ {
+ ib::error_or_warn w(!rec_deleted);
+ w << "Clustered index record not found for index "
+ << index->name << " of table " << index->table->name
+ << ": " << rec_offsets_print(rec, offsets);
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE, "InnoDB: %s",
+ w.m_oss.str().c_str());
+ }
+
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ prebuilt->autoinc_error= rec_deleted
+ ? DB_MISSING_HISTORY
+ : DB_CORRUPTION;
+ }
+ else if (&view == &check_table_extended_view)
+ extended_not_found:
+ if (view.changes_visible(page_trx_id))
+ goto not_found;
+ did_not_find:
+ mtr.rollback_to_savepoint(savepoint);
+ goto next_rec;
+ }
+
+ rec_offs *clust_offsets;
+ trx_id_t rec_trx_id;
+ rec_t *old_vers= nullptr;
+
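+      /* found_in_view: whether the newest clust_rec version that is
+      visible in our read view matches rec and is not delete-marked.
+      visible_trx_id: the DB_TRX_ID of that visible version
+      (~0ULL until it has been determined). */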
+ bool found_in_view= false;
+ trx_id_t visible_trx_id= ~0ULL;
+
+ if (ulint trx_id_offset= clust_index->trx_id_offset)
+ {
+ clust_offsets= nullptr;
+ read_trx_id:
+ rec_trx_id= trx_read_trx_id(clust_rec + trx_id_offset);
+
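+        /* The most significant bit of DB_ROLL_PTR (stored in the byte
+        right after DB_TRX_ID) is the insert flag: if it is set, this
+        version was created by a fresh insert and no older version
+        exists in the undo log. */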
+ if (clust_rec[trx_id_offset + DATA_TRX_ID_LEN] & 0x80)
+ {
+ if (UNIV_UNLIKELY
+ (rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant())))
+ {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ /* This is the oldest available record version (fresh insert). */
+ if (!view.changes_visible(rec_trx_id))
+ {
+ if (rec_trx_id >= view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ goto invalid_rec_trx_id;
+ if (got_extended_match)
+ goto check_latest_version;
+ goto did_not_find;
+ }
+ }
+ }
+ else
+ {
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ulint trx_id_pos= clust_index->n_uniq ? clust_index->n_uniq : 1;
+ ulint len;
+ trx_id_offset= rec_get_nth_field_offs(clust_offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ goto read_trx_id;
+ }
+
+ if (got_extended_match)
+ {
+ check_latest_version:
+ /* In CHECK TABLE...EXTENDED, always check if the secondary
+ index record matches the latest clustered index record
+ version, no matter if it is visible in our own read view.
+
+ If the latest clustered index version is delete-marked and
+ purgeable, it is not safe to fetch any BLOBs for column prefix
+ indexes because they may already have been freed. */
+ if (rec_trx_id &&
+ rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant()) &&
+ purge_sys.is_purgeable(rec_trx_id))
+ goto did_not_find;
+
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err= row_check_index_match(prebuilt,
+ clust_rec, clust_index, clust_offsets,
+ rec, index, offsets);
+
+ switch (err) {
+ default:
+ goto func_exit;
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ }
+
+ got_extended_match= err == DB_SUCCESS;
+ err= DB_SUCCESS;
+
+ if (!prebuilt->trx->read_view.changes_visible(rec_trx_id))
+ /* While CHECK TABLE ... EXTENDED checks for a matching
+ clustered index record version for each secondary index
+ record, it must count only those records that belong to its
+ own read view.
+
+ If the latest version of clust_rec matches rec but is not
+ in our read view, there may still be an older version of
+ clust_rec that not only matches rec but is in our view.
+ We must evaluate old versions before deciding whether rec
+ should be counted. */
+ goto check_old_vers;
+
+ /* Remember that this is the visible clust_rec for rec,
+ and whether it matches rec. */
+ visible_trx_id= rec_trx_id;
+ found_in_view= got_extended_match &&
+ !rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant());
+
+ if (!got_extended_match)
+ goto check_old_vers;
+
+ if (!found_in_view)
+ goto did_not_find;
+
+ found_match:
+ mtr.rollback_to_savepoint(savepoint);
+ goto count_row;
+ }
+ else if (!view.changes_visible(rec_trx_id))
+ {
+ check_old_vers:
+ if (rec_trx_id >= view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ invalid_rec_trx_id:
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: DB_TRX_ID=" TRX_ID_FMT
+ " exceeds the system-wide maximum",
+ rec_trx_id);
+ goto not_found;
+ }
+
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ row_sel_reset_old_vers_heap(prebuilt);
+ /* The following is adapted from row_vers_build_for_consistent_read()
+ because when using check_table_extended_view, we must
+ consider every available version of the clustered index record. */
+ mem_heap_t *vers_heap= nullptr;
+
+ for (;;)
+ {
+ mem_heap_t *prev_heap= vers_heap;
+ vers_heap= mem_heap_create(1024);
+ err= trx_undo_prev_version_build(clust_rec,
+ clust_index, clust_offsets,
+ vers_heap, &old_vers,
+ nullptr, nullptr, 0);
+ if (prev_heap)
+ mem_heap_free(prev_heap);
+ if (err != DB_SUCCESS)
+ {
+ old_vers_err:
+ mem_heap_free(vers_heap);
+ if (err == DB_MISSING_HISTORY)
+ {
+ err= DB_SUCCESS;
+ if (got_extended_match)
+ goto did_not_find;
+ goto not_found;
+ }
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(!old_vers))
+ {
+ mem_heap_free(vers_heap);
+ /* We did not find a matching clustered index record version
+ for the secondary index record. Normal CHECK TABLE will simply
+ not count the secondary index record; CHECK TABLE ... EXTENDED
+ will flag such orphan records if appropriate.
+
+          A secondary index record may be a "temporary orphan"
+ if purge is in progress. We will only flag them if
+ everything up to PAGE_MAX_TRX_ID has been fully purged.
+
+ "Temporary orphans" may be produced when
+ row_undo_mod_clust() resets the DB_TRX_ID of the latest
+ clust_rec version or when trx_undo_prev_version_build()
+ encounters a BLOB that may have been freed according to
+ purge_sys.view (not purge_sys.end_view). */
+ if (&view == &check_table_extended_view && !got_extended_match)
+ goto extended_not_found;
+ goto did_not_find;
+ }
+
+ clust_rec= old_vers;
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, clust_offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id= row_get_rec_trx_id(clust_rec, clust_index,
+ clust_offsets);
+
+ if (UNIV_UNLIKELY(rec_trx_id >=
+ prebuilt->trx->read_view.low_limit_id() &&
+ rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ mem_heap_free(vers_heap);
+ goto invalid_rec_trx_id;
+ }
+
+ const bool rec_visible=
+ prebuilt->trx->read_view.changes_visible(rec_trx_id);
+ const bool clust_rec_deleted=
+ rec_get_deleted_flag(clust_rec, prebuilt->table->not_redundant());
+
+ if (&view != &prebuilt->trx->read_view)
+ {
+ /* It is not safe to fetch BLOBs of committed delete-marked
+ records that may have been freed in purge. */
+ err= clust_rec_deleted && rec_trx_id &&
+ purge_sys.is_purgeable(rec_trx_id)
+ ? DB_SUCCESS_LOCKED_REC
+ : row_check_index_match(prebuilt,
+ clust_rec, clust_index, clust_offsets,
+ rec, index, offsets);
+
+ switch (err) {
+ default:
+ goto old_vers_err;
+ case DB_SUCCESS_LOCKED_REC:
+ if (rec_visible && !~visible_trx_id)
+ visible_trx_id= rec_trx_id;
+ continue;
+ case DB_SUCCESS:
+ got_extended_match= true;
+ if (!rec_visible)
+ continue;
+ if (!~visible_trx_id)
+ {
+ visible_trx_id= rec_trx_id;
+ found_in_view= !clust_rec_deleted;
+ }
+ mem_heap_free(vers_heap);
+ if (!found_in_view)
+ goto did_not_find;
+ goto found_match;
+ }
+ }
+ else if (rec_visible)
+ {
+ if (!clust_rec_deleted)
+ {
+ clust_rec= rec_copy(mem_heap_alloc(heap,
+ rec_offs_size(clust_offsets)),
+ clust_rec, clust_offsets);
+ rec_offs_make_valid(clust_rec, clust_index, true, clust_offsets);
+ }
+ mem_heap_free(vers_heap);
+ if (clust_rec_deleted)
+ goto did_not_find;
+ goto check_match;
+ }
+ }
+ }
+ else if (rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant()))
+ goto did_not_find;
+
+ ut_ad(clust_rec);
+ ut_ad(&view != &check_table_extended_view);
+
+ /* If we had to go to an earlier version of row or the secondary
+ index record is delete marked, then it may be that the secondary
+ index record corresponding to clust_rec (or old_vers) is not
+ rec; in that case we must ignore such row because in our
+ snapshot rec would not have existed. Remember that from rec we
+ cannot see directly which transaction id corresponds to it: we
+ have to go to the clustered index record. A query where we want
+ to fetch all rows where the secondary index value is in some
+ interval would return a wrong result if we would not drop rows
+ which we come to visit through secondary index records that
+ would not really exist in our snapshot. */
+
+ if (rec_deleted)
+ {
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ check_match:
+ /* This clustered index record version exists in
+ prebuilt->trx->read_view and is not delete-marked.
+ By design, any BLOBs in it are not allowed to be
+ freed in the purge of committed transaction history. */
+ err= row_check_index_match(prebuilt, clust_rec, clust_index,
+ clust_offsets, rec, index, offsets);
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ err= DB_SUCCESS;
+ goto did_not_find;
+ default:
+ goto func_exit;
+ }
+ }
+
+ mtr.rollback_to_savepoint(savepoint);
+ }
+ }
+ else
+ {
+ invalid_PAGE_MAX_TRX_ID:
+ if (UNIV_LIKELY(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN))
+ {
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE,
+ "InnoDB: Invalid PAGE_MAX_TRX_ID=%llu"
+ " in index '%-.200s'",
+ page_trx_id, index->name());
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ }
+ goto next_rec;
+ }
+
+count_row:
+ ++*n_rows;
+
+ if (prev_entry)
+ {
+ ulint matched_fields= 0;
+ int cmp= cmp_dtuple_rec_with_match(prev_entry, rec, index, offsets,
+ &matched_fields);
+ const char* msg;
+
+ if (UNIV_LIKELY(cmp < 0));
+ else if (cmp > 0)
+ {
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ msg= "index records in a wrong order in ";
+not_ok:
+ ib::error() << msg << index->name << " of table " << index->table->name
+ << ": " << *prev_entry << ", "
+ << rec_offsets_print(rec, offsets);
+ }
+ else if (index->is_unique() && matched_fields >=
+ dict_index_get_n_ordering_defined_by_user(index))
+ {
+ /* NULL values in unique indexes are considered not to be duplicates */
+ for (ulint i= 0; i < dict_index_get_n_ordering_defined_by_user(index);
+ i++)
+ if (dfield_is_null(dtuple_get_nth_field(prev_entry, i)))
+ goto next_rec;
+
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ prebuilt->autoinc_error= DB_DUPLICATE_KEY;
+ msg= "duplicate key in ";
+ goto not_ok;
+ }
+ }
+
+next_rec:
+ ut_ad(err == DB_SUCCESS);
+
+ {
+ mem_heap_t *tmp_heap= nullptr;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_))
+ {
+ ulint size= rec_offs_get_n_alloc(offsets) * sizeof *offsets;
+ tmp_heap= mem_heap_create(size);
+ offsets= static_cast<rec_offs*>(mem_heap_dup(tmp_heap, offsets, size));
+ }
+
+ mem_heap_empty(heap);
+ prev_entry= row_rec_to_index_entry(rec, index, offsets, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap))
+ mem_heap_free(tmp_heap);
+ }
+
+ if (btr_pcur_is_after_last_on_page(prebuilt->pcur))
+ goto next_page;
+
+ goto rec_loop;
+}
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. If the value is less than
+0 and the type is not unsigned then we reset the value to 0.
+@return value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+ dict_index_t* index, /*!< in: index to read from */
+ const rec_t* rec, /*!< in: current rec */
+ ulint col_no, /*!< in: column number */
+ ulint mtype, /*!< in: column main type */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ ulint len;
+ const byte* data;
+ ib_uint64_t value;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ ut_ad(page_rec_is_leaf(rec));
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ col_no + 1, &heap);
+
+ if (rec_offs_nth_sql_null(offsets, col_no)) {
+ /* There is no non-NULL value in the auto-increment column. */
+ value = 0;
+ goto func_exit;
+ }
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ value = row_parse_int(data, len, mtype, unsigned_type);
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(value);
+}
+
+/** Get the non-delete-marked record with the maximum key value in an index.
+@param[in] index index tree
+@param[in,out] mtr mini-transaction (may be committed and restarted)
+@return maximum record, page s-latched in mtr
+@retval NULL if there are no records, or if all of them are delete-marked */
+static
+const rec_t*
+row_search_get_max_rec(
+ dict_index_t* index,
+ mtr_t* mtr)
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const bool desc = index->fields[0].descending;
+
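+	/* If the first key field is in descending order, the maximum key
+	value sorts first in the index, so scan forward from the start;
+	otherwise it sorts last, so scan backward from the end. */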
+ if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ if (desc) {
+ const bool comp = index->table->not_redundant();
+ while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+ rec = btr_pcur_get_rec(&pcur);
+ if (rec_is_metadata(rec, *index)) {
+ continue;
+ }
+ if (!rec_get_deleted_flag(rec, comp)) {
+ goto found;
+ }
+ }
+ } else {
+ do {
+ rec = page_find_rec_last_not_deleted(
+ btr_pcur_get_page(&pcur));
+ if (page_rec_is_user_rec(rec)) {
+ goto found;
+ }
+ btr_pcur_move_before_first_on_page(&pcur);
+ } while (btr_pcur_move_to_prev(&pcur, mtr));
+ }
+
+ rec = nullptr;
+
+found:
+ ut_ad(!rec
+ || !(rec_get_info_bits(rec, dict_table_is_comp(index->table))
+ & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)));
+ return(rec);
+}
+
+/** Read the max AUTOINC value from an index.
+@param[in] index index starting with an AUTO_INCREMENT column
+@return the largest AUTO_INCREMENT value
+@retval 0 if no records were found */
+ib_uint64_t
+row_search_max_autoinc(dict_index_t* index)
+{
+ const dict_field_t* dfield = dict_index_get_nth_field(index, 0);
+
+ ib_uint64_t value = 0;
+
+ mtr_t mtr;
+ mtr.start();
+
+ if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) {
+ value = row_search_autoinc_read_column(
+ index, rec, 0,
+ dfield->col->mtype,
+ dfield->col->prtype & DATA_UNSIGNED);
+ }
+
+ mtr.commit();
+ return(value);
+}
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
new file mode 100644
index 00000000..23255cc9
--- /dev/null
+++ b/storage/innobase/row/row0uins.cc
@@ -0,0 +1,652 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.cc
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "fil0fil.h"
+#include <mysql/service_thd_mdl.h>
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***************************************************************//**
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ undo_node_t* node) /*!< in: undo node */
+{
+ dberr_t err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+ dict_index_t* index = node->pcur.index();
+ table_id_t table_id = 0;
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
+restart:
+ MDL_ticket* mdl_ticket = nullptr;
+ ut_ad(!table_id || dict_locked
+ || !node->trx->dict_operation_lock_mode);
+ dict_table_t *table = table_id
+ ? dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+ node->trx->mysql_thd, &mdl_ticket)
+ : nullptr;
+
+ ut_ad(index->is_primary());
+ ut_ad(node->trx->in_rollback);
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ ut_ad(index->table->id >= DICT_HDR_FIRST_ID);
+ } else {
+ index->set_modified(mtr);
+ ut_ad(lock_table_has_locks(index->table));
+ }
+
+ /* This is similar to row_undo_mod_clust(). The DDL thread may
+ already have copied this row from the log to the new table.
+ We must log the removal, so that the row will be correctly
+ purged. However, we can log the removal out of sync with the
+ B-tree modification. */
+ ut_a(node->pcur.restore_position(
+ (node->rec_type == TRX_UNDO_INSERT_METADATA)
+ ? BTR_MODIFY_TREE
+ : BTR_MODIFY_LEAF,
+ &mtr) == btr_pcur_t::SAME_ALL);
+ rec_t* rec = btr_pcur_get_rec(&node->pcur);
+
+ ut_ad(rec_get_trx_id(rec, index) == node->trx->id
+ || node->table->is_temporary());
+ ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant())
+ || rec_is_alter_metadata(rec, index->table->not_redundant()));
+ ut_ad(rec_is_metadata(rec, index->table->not_redundant())
+ == (node->rec_type == TRX_UNDO_INSERT_METADATA));
+
+ switch (node->table->id) {
+ case DICT_COLUMNS_ID:
+ /* This is rolling back an INSERT into SYS_COLUMNS.
+ If it was part of an instant ALTER TABLE operation, we
+ must evict the table definition, so that it can be
+ reloaded after the dictionary operation has been
+ completed. At this point, any corresponding operation
+ to the metadata record will have been rolled back. */
+ ut_ad(node->trx->dict_operation_lock_mode);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ if (rec_get_n_fields_old(rec)
+ != DICT_NUM_FIELDS__SYS_COLUMNS
+ || (rec_get_1byte_offs_flag(rec)
+ ? rec_1_get_field_end_info(rec, 0) != 8
+ : rec_2_get_field_end_info(rec, 0) != 8)) {
+ break;
+ }
+ static_assert(!DICT_FLD__SYS_COLUMNS__TABLE_ID, "");
+ node->trx->evict_table(mach_read_from_8(rec));
+ break;
+ case DICT_INDEXES_ID:
+ ut_ad(node->trx->dict_operation_lock_mode);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ if (!table_id) {
+ table_id = mach_read_from_8(rec);
+ if (table_id) {
+ mtr.commit();
+ goto restart;
+ }
+ ut_ad("corrupted SYS_INDEXES record" == 0);
+ }
+
+ pfs_os_file_t d = OS_FILE_CLOSED;
+
+ const uint32_t space_id = dict_drop_index_tree(
+ &node->pcur, node->trx, &mtr);
+ if (space_id) {
+ if (table) {
+ lock_release_on_rollback(node->trx,
+ table);
+ if (!dict_locked) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+ if (table->release()) {
+ dict_sys.remove(table);
+ } else if (table->space_id
+ == space_id) {
+ table->space = nullptr;
+ table->file_unreadable = true;
+ }
+ if (!dict_locked) {
+ dict_sys.unlock();
+ }
+ table = nullptr;
+ if (!mdl_ticket);
+ else if (MDL_context* mdl_context =
+ static_cast<MDL_context*>(
+ thd_mdl_context(
+ node->trx->
+ mysql_thd))) {
+ mdl_context->release_lock(
+ mdl_ticket);
+ mdl_ticket = nullptr;
+ }
+ }
+
+ d = fil_delete_tablespace(space_id);
+ }
+
+ mtr.commit();
+
+ if (d != OS_FILE_CLOSED) {
+ os_file_close(d);
+ }
+
+ if (space_id) {
+ ibuf_delete_for_discarded_space(space_id);
+ }
+
+ mtr.start();
+ ut_a(node->pcur.restore_position(
+ BTR_MODIFY_LEAF, &mtr) == btr_pcur_t::SAME_ALL);
+ }
+
+ err = btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr);
+
+ if (err != DB_FAIL) {
+ goto func_exit;
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+retry:
+ /* If did not succeed, try pessimistic descent to tree */
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+ ut_a(node->pcur.restore_position(BTR_PURGE_TREE, &mtr)
+ == btr_pcur_t::SAME_ALL);
+
+ btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true,
+ &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+func_exit:
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) {
+ /* When rolling back the very first instant ADD COLUMN
+ operation, reset the root page to the basic state. */
+ btr_reset_instant(*index, true, &mtr);
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+
+ if (UNIV_LIKELY_NULL(table)) {
+ dict_table_close(table, dict_locked,
+ node->trx->mysql_thd, mdl_ticket);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_low(
+/*========================*/
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to remove */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t pcur;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ const bool modify_leaf = mode == BTR_MODIFY_LEAF;
+
+ pcur.btr_cur.page_cur.index = index;
+ row_mtr_start(&mtr, index, !modify_leaf);
+
+ if (index->is_spatial()) {
+ mode = modify_leaf
+ ? btr_latch_mode(BTR_MODIFY_LEAF
+ | BTR_RTREE_DELETE_MARK
+ | BTR_RTREE_UNDO_INS)
+ : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+ if (rtr_search(entry, mode, &pcur, &mtr)) {
+ goto func_exit;
+ }
+
+ if (rec_get_deleted_flag(
+ btr_pcur_get_rec(&pcur),
+ dict_table_is_comp(index->table))) {
+ ib::error() << "Record found in index " << index->name
+ << " is deleted marked on insert rollback.";
+ ut_ad(0);
+ }
+ goto found;
+ } else if (modify_leaf) {
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == BTR_PURGE_TREE);
+ mode = BTR_PURGE_TREE_ALREADY_LATCHED;
+ mtr_x_lock_index(index, &mtr);
+ }
+
+ switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
+ break;
+ case ROW_FOUND:
+ found:
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (modify_leaf) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
+ } else {
+ /* Passing rollback=false here, because we are
+ deleting a secondary index record: the distinction
+ only matters when deleting a record that contains
+ externally stored columns. */
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ false, &mtr);
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to remove */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint n_tries = 0;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_PURGE_TREE, index, entry, thr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
+
+/** Parse an insert undo record.
+@param[in,out] node row rollback state
+@param[in] dict_locked whether the data dictionary cache is locked */
+static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
+{
+ dict_index_t* clust_index;
+ const byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ byte dummy;
+ bool dummy_extern;
+
+ ut_ad(node->trx->in_rollback);
+ ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+
+ node->update = NULL;
+ if (!node->is_temp) {
+ node->table = dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_NORMAL);
+ } else if (!dict_locked) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ dict_sys.unfreeze();
+ } else {
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ }
+
+ if (!node->table) {
+ return false;
+ }
+
+ switch (node->rec_type) {
+ default:
+ ut_ad("wrong undo record type" == 0);
+ goto close_table;
+ case TRX_UNDO_INSERT_METADATA:
+ case TRX_UNDO_INSERT_REC:
+ case TRX_UNDO_EMPTY:
+ break;
+ case TRX_UNDO_RENAME_TABLE:
+ dict_table_t* table = node->table;
+ ut_ad(!table->is_temporary());
+ ut_ad(table->file_unreadable
+ || dict_table_is_file_per_table(table)
+ == !is_system_tablespace(table->space_id));
+ size_t len = mach_read_from_2(node->undo_rec)
+ - page_offset(ptr) - 2;
+ const span<const char> name(reinterpret_cast<const char*>(ptr),
+ len);
+ if (strlen(table->name.m_name) != len
+ || memcmp(table->name.m_name, ptr, len)) {
+ dict_table_rename_in_cache(table, name, true);
+ } else if (table->space && table->space->id) {
+ const auto s = table->space->name();
+ if (len != s.size() || memcmp(ptr, s.data(), len)) {
+ table->rename_tablespace(name, true);
+ }
+ }
+ goto close_table;
+ }
+
+ if (UNIV_UNLIKELY(!node->table->is_accessible())) {
+close_table:
+ /* Normally, tables should not disappear or become
+		inaccessible during ROLLBACK, because they should be
+ protected by InnoDB table locks. Corruption could be
+ a valid exception.
+
+ FIXME: When running out of temporary tablespace, it
+ would probably be better to just drop all temporary
+ tables (and temporary undo log records) of the current
+ connection, instead of doing this rollback. */
+ dict_table_close(node->table, dict_locked);
+ node->table = NULL;
+ return false;
+ } else {
+ ut_ad(!node->table->skip_alter_undo);
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index != NULL) {
+ switch (node->rec_type) {
+ case TRX_UNDO_INSERT_REC:
+ ptr = trx_undo_rec_get_row_ref(
+ ptr, clust_index, &node->ref,
+ node->heap);
+ break;
+ case TRX_UNDO_EMPTY:
+ node->ref = nullptr;
+ return true;
+ default:
+ node->ref = &trx_undo_metadata;
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* An error probably occurred during
+ an insert into the clustered index,
+ after we wrote the undo log record. */
+ goto close_table;
+ }
+ return true;
+ }
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* An error probably occurred during
+ an insert into the clustered index,
+ after we wrote the undo log record. */
+ goto close_table;
+ }
+ if (node->table->n_v_cols) {
+ trx_undo_read_v_cols(node->table, ptr,
+ node->row, false);
+ }
+
+ } else {
+ ib::warn() << "Table " << node->table->name
+ << " has no indexes,"
+ " ignoring the table";
+ goto close_table;
+ }
+ }
+
+ return true;
+}
+
+/***************************************************************//**
+Removes secondary index records.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_rec(
+/*========================*/
+ undo_node_t* node, /*!< in/out: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+ dict_index_t* index;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(1024);
+
+ for (index = node->index; index;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ continue;
+ }
+
+ /* An insert undo record TRX_UNDO_INSERT_REC will
+ always contain all fields of the index. It does not
+ matter if any indexes were created afterwards; all
+ index entries can be reconstructed from the row. */
+ dtuple_t* entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record, or a statement is being rolled
+ back because an error occurred while storing
+ off-page columns.
+
+ Because secondary index entries are inserted
+ after the clustered index record, we may
+ assume that the secondary index record does
+ not exist. */
+ } else {
+ err = row_undo_ins_remove_sec(index, entry, thr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto func_exit;
+ }
+ }
+
+ mem_heap_empty(heap);
+ }
+
+func_exit:
+ node->index = index;
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
+
+ if (!row_undo_ins_parse_undo_rec(node, dict_locked)) {
+ return DB_SUCCESS;
+ }
+
+ ut_ad(node->table->is_temporary()
+ || lock_table_has_locks(node->table));
+
+ /* Iterate over all the indexes and undo the insert.*/
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+
+ switch (node->rec_type) {
+ default:
+ ut_ad("wrong undo record type" == 0);
+ /* fall through */
+ case TRX_UNDO_INSERT_REC:
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+
+ err = row_undo_ins_remove_sec_rec(node, thr);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ log_free_check();
+
+ if (!dict_locked && node->table->id == DICT_INDEXES_ID) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ err = row_undo_ins_remove_clust_rec(node);
+ dict_sys.unlock();
+ } else {
+ ut_ad(node->table->id != DICT_INDEXES_ID
+ || !node->table->is_temporary());
+ err = row_undo_ins_remove_clust_rec(node);
+ }
+
+ if (err == DB_SUCCESS && node->table->stat_initialized) {
+ /* Not protected by dict_sys.latch
+ or table->stats_mutex_lock() for
+ performance reasons, we would rather get garbage
+ in stat_n_rows (which is just an estimate anyway)
+ than protecting the following code with a latch. */
+ dict_table_n_rows_dec(node->table);
+
+ /* Do not attempt to update statistics when
+ executing ROLLBACK in the InnoDB SQL
+ interpreter, because in that case we would
+ already be holding dict_sys.latch, which
+ would be acquired when updating statistics. */
+ if (!dict_locked) {
+ dict_stats_update_if_needed(node->table,
+ *node->trx);
+ }
+ }
+ break;
+
+ case TRX_UNDO_INSERT_METADATA:
+ log_free_check();
+ ut_ad(!node->table->is_temporary());
+ err = row_undo_ins_remove_clust_rec(node);
+ break;
+ case TRX_UNDO_EMPTY:
+ err = node->table->clear(thr);
+ break;
+ }
+
+ dict_table_close(node->table, dict_locked);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
new file mode 100644
index 00000000..a01eaea5
--- /dev/null
+++ b/storage/innobase/row/row0umod.cc
@@ -0,0 +1,1288 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.cc
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may have delete mark already FALSE, if the delete mark operation was
+stopped underway, or if the undo operation ended prematurely because of a
+system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Undoes a modify in a clustered index record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust_low(
+/*===================*/
+ undo_node_t* node, /*!< in: row undo node */
+ rec_offs** offsets,/*!< out: rec_get_offsets() on the record */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR
+ for row_log_table_delete() */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr; must be committed before
+ latching any further pages */
+ btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+
+ pcur = &node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (pcur->restore_position(mode, mtr) != btr_pcur_t::SAME_ALL) {
+ return DB_CORRUPTION;
+ }
+
+ ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur),
+ btr_cur_get_index(btr_cur))
+ == thr_get_trx(thr)->id
+ || btr_cur_get_index(btr_cur)->table->is_temporary());
+ ut_ad(node->ref != &trx_undo_metadata
+ || node->update->info_bits == REC_INFO_METADATA_ADD
+ || node->update->info_bits == REC_INFO_METADATA_ALTER);
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED);
+
+ err = btr_cur_optimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ ut_ad(err != DB_SUCCESS || node->ref != &trx_undo_metadata);
+ } else {
+ big_rec_t* dummy_big_rec;
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap, heap,
+ &dummy_big_rec, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+
+ ut_a(!dummy_big_rec);
+
+ if (err == DB_SUCCESS
+ && node->ref == &trx_undo_metadata
+ && btr_cur_get_index(btr_cur)->table->instant
+ && node->update->info_bits == REC_INFO_METADATA_ADD) {
+ btr_reset_instant(*btr_cur->index(), false, mtr);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+
+ switch (const auto id = btr_cur_get_index(btr_cur)->table->id) {
+ unsigned c;
+ case DICT_TABLES_ID:
+ if (node->trx != trx_roll_crash_recv_trx) {
+ break;
+ }
+ c = DICT_COL__SYS_TABLES__ID;
+ goto evict;
+ case DICT_INDEXES_ID:
+ if (node->trx != trx_roll_crash_recv_trx) {
+ break;
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC
+ && btr_cur_get_rec(btr_cur)
+ [8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]
+ == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) {
+ /* We are rolling back the DELETE of metadata
+ for a failed ADD INDEX operation. This does
+ not affect any cached table definition,
+ because we are filtering out such indexes in
+ dict_load_indexes(). */
+ break;
+ }
+ /* fall through */
+ case DICT_COLUMNS_ID:
+ static_assert(!DICT_COL__SYS_INDEXES__TABLE_ID, "");
+ static_assert(!DICT_COL__SYS_COLUMNS__TABLE_ID, "");
+ c = DICT_COL__SYS_COLUMNS__TABLE_ID;
+ /* This is rolling back an UPDATE or DELETE on SYS_COLUMNS.
+ If it was part of an instant ALTER TABLE operation, we
+ must evict the table definition, so that it can be
+ reloaded after the dictionary operation has been
+ completed. At this point, any corresponding operation
+ to the metadata record will have been rolled back. */
+ evict:
+ const dfield_t& table_id = *dtuple_get_nth_field(node->row, c);
+ ut_ad(dfield_get_len(&table_id) == 8);
+ node->trx->evict_table(mach_read_from_8(
+ static_cast<byte*>(
+ table_id.data)),
+ id == DICT_COLUMNS_ID);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Get the byte offset of the DB_TRX_ID column
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the byte offset of DB_TRX_ID, from the start of rec */
+static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index)
+{
+ ut_ad(index->n_uniq <= MAX_REF_PARTS);
+ ulint trx_id_offset = index->trx_id_offset;
+ if (!trx_id_offset) {
+ /* Reserve enough offsets for the PRIMARY KEY and 2 columns
+ so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ const ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ rec_offs* offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ trx_id_pos + 1, &heap);
+ ut_ad(!heap);
+ ulint len;
+ trx_id_offset = rec_get_nth_field_offs(
+ offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ return trx_id_offset;
+}
+
+/** Determine if rollback must execute a purge-like operation.
+@param node row undo
+@return whether the record should be purged */
+static bool row_undo_mod_must_purge(const undo_node_t &node)
+{
+ ut_ad(node.rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node.table->is_temporary());
+
+ const btr_cur_t &btr_cur= node.pcur.btr_cur;
+ ut_ad(btr_cur.index()->is_primary());
+ DEBUG_SYNC_C("rollback_purge_clust");
+
+ if (!purge_sys.is_purgeable(node.new_trx_id))
+ return false;
+
+ const rec_t *rec= btr_cur_get_rec(&btr_cur);
+ return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur.index())) ==
+ node.new_trx_id;
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo.
+@return DB_SUCCESS or error code: we may run out of file space */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust(
+/*===============*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ dberr_t err;
+ dict_index_t* index;
+
+ ut_ad(thr_get_trx(thr) == node->trx);
+ ut_ad(node->trx->in_rollback);
+
+ log_free_check();
+ pcur = &node->pcur;
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur));
+ ut_ad(index->is_primary());
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ ut_ad(lock_table_has_locks(index->table));
+ }
+
+ mem_heap_t* heap = mem_heap_create(1024);
+ mem_heap_t* offsets_heap = NULL;
+ rec_offs* offsets = NULL;
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+ heap, sys, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr.start();
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(mtr);
+ }
+
+ err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+ heap, sys, thr, &mtr,
+ BTR_MODIFY_TREE);
+ ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+ }
+
+	/**
+	 * When scrubbing, and records get cleared,
+	 * the transaction id is not present afterwards.
+	 * This is safe: since the record is on the free list,
+	 * it can be reallocated at any time after this mtr commits,
+	 * which happens just below.
+	 */
+ ut_ad(srv_immediate_scrub_data_uncompressed
+ || row_get_rec_trx_id(btr_pcur_get_rec(pcur), index, offsets)
+ == node->new_trx_id);
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ DEBUG_SYNC_C("rollback_undo_pk");
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* FIXME: Perform the below operations in the above
+ mini-transaction when possible. */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(node->new_trx_id);
+
+ mtr.start();
+ if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) !=
+ btr_pcur_t::SAME_ALL) {
+ goto mtr_commit_exit;
+ }
+
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ err = btr_cur_optimistic_delete(&pcur->btr_cur, 0,
+ &mtr);
+ if (err != DB_FAIL) {
+ goto mtr_commit_exit;
+ }
+ err = DB_SUCCESS;
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ } else {
+ index->set_modified(mtr);
+ if (!row_undo_mod_must_purge(*node)) {
+ goto mtr_commit_exit;
+ }
+ err = btr_cur_optimistic_delete(&pcur->btr_cur, 0,
+ &mtr);
+ if (err != DB_FAIL) {
+ goto mtr_commit_exit;
+ }
+ err = DB_SUCCESS;
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ mtr.start();
+ if (pcur->restore_position(BTR_PURGE_TREE, &mtr) !=
+ btr_pcur_t::SAME_ALL) {
+ goto mtr_commit_exit;
+ }
+
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+
+ if (index->table->is_temporary()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ if (!row_undo_mod_must_purge(*node)) {
+ goto mtr_commit_exit;
+ }
+ index->set_modified(mtr);
+ }
+
+ /* This operation is analogous to purge, we can free
+ also inherited externally stored fields. We can also
+ assume that the record was complete (including BLOBs),
+ because it had been delete-marked after it had been
+ completely inserted. Therefore, we are passing
+ rollback=false, just like purge does. */
+ btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0,
+ false, &mtr);
+ ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+ } else if (!index->table->is_temporary() && node->new_trx_id) {
+ /* We rolled back a record so that it still exists.
+ We must reset the DB_TRX_ID if the history is no
+ longer accessible by any active read view. */
+
+ mtr.start();
+ if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr)
+ != btr_pcur_t::SAME_ALL
+ || !purge_sys.is_purgeable(node->new_trx_id)) {
+ goto mtr_commit_exit;
+ }
+
+ rec_t* rec = btr_pcur_get_rec(pcur);
+ ulint trx_id_offset = index->trx_id_offset;
+ ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
+ /* Reserve enough offsets for the PRIMARY KEY and
+ 2 columns so that we can access DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2];
+ if (trx_id_offset) {
+#ifdef UNIV_DEBUG
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ if (buf_block_get_page_zip(
+ btr_pcur_get_block(&node->pcur))) {
+ /* Below, page_zip_write_trx_id_and_roll_ptr()
+ needs offsets to access DB_TRX_ID,DB_ROLL_PTR.
+ We already computed offsets for possibly
+ another record in the clustered index.
+ Because the PRIMARY KEY is fixed-length,
+ the offsets for the PRIMARY KEY and
+ DB_TRX_ID,DB_ROLL_PTR are still valid.
+ Silence the rec_offs_validate() assertion. */
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+#endif
+ } else if (rec_is_metadata(rec, *index)) {
+ ut_ad(!buf_block_get_page_zip(btr_pcur_get_block(
+ pcur)));
+ for (unsigned i = index->first_user_field(); i--; ) {
+ trx_id_offset += index->fields[i].fixed_len;
+ }
+ } else {
+ ut_ad(index->n_uniq <= MAX_REF_PARTS);
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ trx_id_pos + 2, &heap);
+ ulint len;
+ trx_id_offset = rec_get_nth_field_offs(
+ offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) {
+ ut_ad(!rec_get_deleted_flag(
+ rec, dict_table_is_comp(node->table))
+ || rec_is_alter_metadata(rec, *index));
+ index->set_modified(mtr);
+ buf_block_t* block = btr_pcur_get_block(pcur);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ block, rec, offsets, trx_id_pos,
+ 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS,
+ &mtr);
+ } else {
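+				/* Reset DB_TRX_ID to 0 and DB_ROLL_PTR to
+				1<<ROLL_PTR_INSERT_FLAG_POS (no older version
+				in the undo log), like the compressed-page
+				branch above. */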
+ size_t offs = page_offset(rec + trx_id_offset);
+ mtr.memset(block, offs, DATA_TRX_ID_LEN, 0);
+ offs += DATA_TRX_ID_LEN;
+ mtr.write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame
+ + offs, 0x80U);
+ mtr.memset(block, offs + 1,
+ DATA_ROLL_PTR_LEN - 1, 0);
+ }
+ }
+ } else {
+ goto func_exit;
+ }
+
+mtr_commit_exit:
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+func_exit:
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry */
+ btr_latch_mode mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ const bool modify_leaf = mode == BTR_MODIFY_LEAF;
+
+ row_mtr_start(&mtr, index, !modify_leaf);
+
+ pcur.btr_cur.page_cur.index = index;
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (index->is_spatial()) {
+ mode = modify_leaf
+ ? btr_latch_mode(BTR_MODIFY_LEAF
+ | BTR_RTREE_DELETE_MARK
+ | BTR_RTREE_UNDO_INS)
+ : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
+ btr_cur->thr = thr;
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ goto found;
+ } else {
+ goto func_exit;
+ }
+ } else if (!index->is_committed()) {
+ /* The index->online_status may change if the index is
+ or was being created online, but not committed yet. It
+ is protected by index->lock. */
+ if (modify_leaf) {
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == BTR_PURGE_TREE);
+ mode = BTR_PURGE_TREE_ALREADY_LATCHED;
+ mtr_x_lock_index(index, &mtr);
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE if
+ index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ switch (UNIV_EXPECT(row_search_index_entry(entry, mode, &pcur, &mtr),
+ ROW_FOUND)) {
+ case ROW_NOT_FOUND:
+ /* In crash recovery, the secondary index record may
+ be missing if the UPDATE did not have time to insert
+ the secondary index records before the crash. When we
+ are undoing that UPDATE in crash recovery, the record
+ may be missing.
+
+ In normal processing, if an update ends in a deadlock
+ before it has inserted all updated secondary index
+ records, then the undo will not find those records. */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+found:
+ /* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some version
+	requires it, we should delete-mark the record instead. */
+
+ mtr_vers.start();
+
+ ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) ==
+ btr_pcur_t::SAME_ALL);
+
+	/* For temporary tables, we can skip checking older versions of the
+	clustered index entry, because there is no MVCC or purge. */
+ if (node->table->is_temporary()
+ || row_vers_old_has_index_entry(
+ false, btr_pcur_get_rec(&node->pcur),
+ &mtr_vers, index, entry, 0, 0)) {
+ btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), &mtr);
+ } else {
+ /* Remove the index record */
+
+ if (dict_index_is_spatial(index)) {
+ rec_t* rec = btr_pcur_get_rec(&pcur);
+ if (rec_get_deleted_flag(rec,
+ dict_table_is_comp(index->table))) {
+ ib::error() << "Record found in index "
+ << index->name << " is deleted marked"
+ " on rollback update.";
+ ut_ad(0);
+ }
+ }
+
+ if (modify_leaf) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
+ } else {
+ /* Passing rollback=false,
+ because we are deleting a secondary index record:
+ the distinction only matters when deleting a
+ record that contains externally stored columns. */
+ ut_ad(!index->is_primary());
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ false, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.cc, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ dberr_t err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_PURGE_TREE);
+ return(err);
+}
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does no harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+@retval DB_SUCCESS on success
+@retval DB_FAIL if BTR_MODIFY_TREE should be tried
+@retval DB_OUT_OF_FILE_SPACE when running out of tablespace
+@retval DB_DUPLICATE_KEY if the value was missing
+ and an insert would lead to a duplicate key */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ btr_latch_mode mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+ upd_t* update;
+ dberr_t err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ const ulint flags
+ = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG;
+ const auto orig_mode = mode;
+
+ pcur.btr_cur.page_cur.index = index;
+ ut_ad(trx->id != 0);
+
+ if (index->is_spatial()) {
+		/* FIXME: Currently we do a 2-pass search for the undo,
+		in order to avoid delete-unmarking a wrong record when
+		rolling back a partial update. Later, we could log some
+		info in secondary index updates to avoid this. */
+ static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), "");
+ ut_ad(!(mode & 8));
+ mode = btr_latch_mode(mode | BTR_RTREE_DELETE_MARK);
+ }
+
+try_again:
+ row_mtr_start(&mtr, index, mode & 8);
+
+ btr_cur->thr = thr;
+
+ if (index->is_spatial()) {
+ if (!rtr_search(entry, mode, &pcur, &mtr)) {
+ goto found;
+ }
+
+ if (mode != orig_mode && btr_cur->rtr_info->fd_del) {
+ mode = orig_mode;
+ btr_pcur_close(&pcur);
+ mtr.commit();
+ goto try_again;
+ }
+
+ goto not_found;
+ }
+
+ switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ rec_offs* offsets;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
+not_found:
+ if (btr_cur->up_match >= dict_index_get_n_unique(index)
+ || btr_cur->low_match >= dict_index_get_n_unique(index)) {
+ ib::warn() << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on rollback, and"
+ " a duplicate exists: "
+ << *entry
+ << " at: " << rec_index_print(
+ btr_cur_get_rec(btr_cur), index);
+ err = DB_DUPLICATE_KEY;
+ break;
+ }
+
+ ib::warn() << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on rollback, trying to insert: "
+ << *entry
+ << " at: " << rec_index_print(
+ btr_cur_get_rec(btr_cur), index);
+
+ /* Insert the missing record that we were trying to
+ delete-unmark. */
+ big_rec_t* big_rec;
+ rec_t* insert_rec;
+ offsets = NULL;
+ offsets_heap = NULL;
+
+ err = btr_cur_optimistic_insert(
+ flags, btr_cur, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ ut_ad(!big_rec);
+
+ if (err == DB_FAIL && mode == BTR_MODIFY_TREE) {
+ err = btr_cur_pessimistic_insert(
+ flags, btr_cur,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ /* There are no off-page columns in
+ secondary indexes. */
+ ut_ad(!big_rec);
+ }
+
+ if (err == DB_SUCCESS) {
+ page_update_max_trx_id(
+ btr_cur_get_block(btr_cur),
+ btr_cur_get_page_zip(btr_cur),
+ trx->id, &mtr);
+ }
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ break;
+ case ROW_FOUND:
+found:
+ btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), &mtr);
+ heap = mem_heap_create(
+ sizeof(upd_t)
+ + dtuple_get_n_fields(entry) * sizeof(upd_field_t));
+ offsets_heap = NULL;
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(btr_cur),
+ index, nullptr, index->n_core_fields, ULINT_UNDEFINED,
+ &offsets_heap);
+ update = row_upd_build_sec_rec_difference_binary(
+ btr_cur_get_rec(btr_cur), index, offsets, entry, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode != BTR_MODIFY_TREE) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ /* TODO: pass offsets, not &offsets */
+ err = btr_cur_optimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ err = btr_cur_pessimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ heap, &dummy_big_rec,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ ut_a(!dummy_big_rec);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ do {
+ dict_index_t* index = node->index;
+
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ continue;
+ }
+
+ /* During online index creation,
+		HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ should guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ dtuple_t* entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(thr_get_trx(thr) == trx_roll_crash_recv_trx);
+ } else {
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ } while ((node->index = dict_table_get_next_index(node->index)));
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_sec(
+/*======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ do {
+ dict_index_t* index = node->index;
+
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ continue;
+ }
+
+ /* During online index creation,
+ HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK
+ should guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ dtuple_t* entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ index->type |= DICT_CORRUPT;
+ err = DB_SUCCESS;
+ /* Do not return any error to the caller. The
+ duplicate will be reported by ALTER TABLE or
+ CREATE UNIQUE INDEX. Unfortunately we cannot
+ report the duplicate key value to the DDL
+ thread, because the altered_table object is
+ private to its call stack. */
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ } while ((node->index = dict_table_get_next_index(node->index)));
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ return DB_SUCCESS;
+ }
+
+ mem_heap_t* heap = mem_heap_create(1024);
+ dberr_t err = DB_SUCCESS;
+
+ do {
+ dict_index_t* index = node->index;
+
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ continue;
+ }
+
+ if (!row_upd_changes_ord_field_binary_func(
+ index, node->update,
+#ifdef UNIV_DEBUG
+ thr,
+#endif /* UNIV_DEBUG */
+ node->row, node->ext, ROW_BUILD_FOR_UNDO)) {
+ continue;
+ }
+
+ /* Build the newest version of the index entry */
+ dtuple_t* entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The server must have crashed in
+ row_upd_clust_rec_by_insert() before
+ the updated externally stored columns (BLOBs)
+ of the new clustered index entry were written. */
+
+ /* The table must be in DYNAMIC or COMPRESSED
+ format. REDUNDANT and COMPACT formats
+ store a local 768-byte prefix of each
+ externally stored column. */
+ ut_a(dict_table_has_atomic_blobs(index->table));
+
+ /* This is only legitimate when
+ rolling back an incomplete transaction
+ after crash recovery. */
+ ut_a(thr_get_trx(thr)->is_recovered);
+
+ /* The server must have crashed before
+ completing the insert of the new
+ clustered index entry and before
+ inserting to the secondary indexes.
+ Because node->row was not yet written
+ to this index, we can ignore it. But
+ we must restore node->undo_row. */
+ } else {
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the
+ original values because we do not know them.
+ But this should not cause problems because
+ in row0sel.cc, in queries we always retrieve
+ the clustered index record or an earlier
+ version of it, if the secondary index record
+ through which we do the search is
+ delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+ entry = row_build_index_entry_low(node->undo_row,
+ node->undo_ext,
+ index, heap,
+ ROW_BUILD_FOR_UNDO);
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ index->type |= DICT_CORRUPT;
+ err = DB_SUCCESS;
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ } while ((node->index = dict_table_get_next_index(node->index)));
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/** Parse an update undo record.
+@param[in,out] node row rollback state
+@param[in] dict_locked whether the data dictionary cache is locked */
+static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
+{
+ dict_index_t* clust_index;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ byte type;
+ byte cmpl_info;
+ bool dummy_extern;
+
+ ut_ad(node->trx->in_rollback);
+ ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
+
+ const byte *ptr = trx_undo_rec_get_pars(
+ node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (!node->is_temp) {
+ node->table = dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_NORMAL);
+ } else if (!dict_locked) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ dict_sys.unfreeze();
+ } else {
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ }
+
+ if (!node->table) {
+ return false;
+ }
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ if (UNIV_UNLIKELY(!node->table->is_accessible())) {
+close_table:
+ /* Normally, tables should not disappear or become
+	inaccessible during ROLLBACK, because they should be
+ protected by InnoDB table locks. Corruption could be
+ a valid exception.
+
+ FIXME: When running out of temporary tablespace, it
+ would probably be better to just drop all temporary
+ tables (and temporary undo log records) of the current
+ connection, instead of doing this rollback. */
+ dict_table_close(node->table, dict_locked);
+ node->table = NULL;
+ return false;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits,
+ node->heap, &(node->update));
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+ ut_ad(!node->ref->info_bits);
+
+ if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) {
+ if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG)
+ != REC_INFO_MIN_REC_FLAG) {
+ ut_ad("wrong info_bits in undo log record" == 0);
+ goto close_table;
+ }
+ /* This must be an undo log record for a subsequent
+ instant ALTER TABLE, extending the metadata record. */
+ ut_ad(clust_index->is_instant());
+ ut_ad(clust_index->table->instant
+ || !(node->update->info_bits & REC_INFO_DELETED_FLAG));
+ node->ref = &trx_undo_metadata;
+ node->update->info_bits = (node->update->info_bits
+ & REC_INFO_DELETED_FLAG)
+ ? REC_INFO_METADATA_ALTER
+ : REC_INFO_METADATA_ADD;
+ }
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+ /* As long as this rolling-back transaction exists,
+ the PRIMARY KEY value pointed to by the undo log
+ record should exist.
+
+ However, if InnoDB is killed during a rollback, or
+ shut down during the rollback of recovered
+ transactions, then after restart we may try to roll
+ back some of the same undo log records again, because
+ trx_roll_try_truncate() is not being invoked after
+ every undo log record.
+
+ It is also possible that the record
+ was not modified yet (the DB_ROLL_PTR does not match
+ node->roll_ptr) and thus there is nothing to roll back.
+
+ btr_cur_upd_lock_and_undo() only writes the undo log
+ record after successfully acquiring an exclusive lock
+		on the clustered index record. That lock will not
+ be released before the transaction is committed or
+ fully rolled back. (Exception: if the server was
+ killed, restarted, and shut down again before the
+ rollback of the recovered transaction was completed,
+ it is possible that the transaction was partially
+ rolled back and locks released.) */
+ goto close_table;
+ }
+
+ /* Extract indexed virtual columns from undo log */
+ if (node->ref != &trx_undo_metadata && node->table->n_v_cols) {
+ row_upd_replace_vcol(node->row, node->table,
+ node->update, false, node->undo_row,
+ (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? nullptr : ptr);
+ }
+
+ return true;
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+ ut_ad(thr_get_trx(thr) == node->trx);
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
+
+ if (!row_undo_mod_parse_undo_rec(node, dict_locked)) {
+ return DB_SUCCESS;
+ }
+
+ ut_ad(node->table->is_temporary()
+ || lock_table_has_locks(node->table));
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+
+ if (node->ref->info_bits) {
+ ut_ad(node->ref->is_metadata());
+ goto rollback_clust;
+ }
+
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+ if (node->index) {
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ err = row_undo_mod_upd_exist_sec(node, thr);
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ err = row_undo_mod_del_mark_sec(node, thr);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ err = row_undo_mod_upd_del_sec(node, thr);
+ break;
+ default:
+ MY_ASSERT_UNREACHABLE();
+ }
+ }
+
+ if (err == DB_SUCCESS) {
+rollback_clust:
+ err = row_undo_mod_clust(node, thr);
+
+ bool update_statistics
+ = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+
+ if (err == DB_SUCCESS && node->table->stat_initialized) {
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ dict_table_n_rows_inc(node->table);
+ update_statistics = update_statistics
+ || !srv_stats_include_delete_marked;
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ dict_table_n_rows_dec(node->table);
+ update_statistics = update_statistics
+ || !srv_stats_include_delete_marked;
+ break;
+ }
+
+ /* Do not attempt to update statistics when
+ executing ROLLBACK in the InnoDB SQL
+ interpreter, because in that case we would
+ already be holding dict_sys.latch, which
+ would be acquired when updating statistics. */
+ if (update_statistics && !dict_locked) {
+ dict_stats_update_if_needed(node->table,
+ *node->trx);
+ } else {
+ node->table->stat_modified_counter++;
+ }
+ }
+ }
+
+ dict_table_close(node->table, dict_locked);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
new file mode 100644
index 00000000..8a1041c8
--- /dev/null
+++ b/storage/innobase/row/row0undo.cc
@@ -0,0 +1,453 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.cc
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, let's store row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it does not update the secondary index nor the clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
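+
+/* An illustrative sketch (not part of the original source) of the removal
+rule stated above, written as hypothetical pseudocode. rec_is_delete_marked()
+and rec_roll_ptr() are placeholder names; the real checks live in
+row0purge.cc and row0vers.cc and also follow older row versions through the
+undo log:
+
+	bool can_remove_clust_rec(const rec_t* rec, roll_ptr_t purged_roll_ptr)
+	{
+		// the version list hanging off the clustered index record
+		// has become empty
+		return rec_is_delete_marked(rec)
+			&& rec_roll_ptr(rec) == purged_roll_ptr;
+	}
+*/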
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(parent);
+
+ undo = static_cast<undo_node_t*>(
+ mem_heap_alloc(heap, sizeof(undo_node_t)));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return true if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+bool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+{
+ dict_index_t* clust_index;
+ bool found;
+ mtr_t mtr;
+ row_ext_t** ext;
+ const rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!node->table->skip_alter_undo);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ if (!found) {
+ goto func_exit;
+ }
+
+ rec = btr_pcur_get_rec(&node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ found = row_get_rec_roll_ptr(rec, clust_index, offsets)
+ == node->roll_ptr;
+
+ if (found) {
+ ut_ad(row_get_rec_trx_id(rec, clust_index, offsets)
+ == node->trx->id || node->table->is_temporary());
+
+ if (dict_table_has_atomic_blobs(node->table)) {
+ /* There is no prefix of externally stored
+ columns in the clustered index record. Build a
+ cache of column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, NULL,
+ NULL, NULL, ext, node->heap);
+
+		/* We will need to parse out virtual column info from the
+		undo log; first mark them DATA_MISSING, so that we will
+		know if the value gets updated. */
+ if (node->table->n_v_cols
+ && !trx_undo_roll_ptr_is_insert(node->roll_ptr)
+ && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ for (ulint i = 0;
+ i < dict_table_get_n_v_cols(node->table); i++) {
+ dfield_get_type(dtuple_get_nth_v_field(
+ node->row, i))->mtype = DATA_MISSING;
+ }
+ }
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+ ut_ad((node->row->info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG
+ || node->row->info_bits == 0);
+ node->undo_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->undo_row, &node->undo_ext,
+ clust_index, node->update, node->heap);
+ } else {
+ ut_ad(((node->row->info_bits & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)
+ == (node->rec_type == TRX_UNDO_INSERT_METADATA));
+ node->undo_row = NULL;
+ node->undo_ext = NULL;
+ }
+
+ btr_pcur_store_position(&node->pcur, &mtr);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+func_exit:
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ return(found);
+}
+
+/** Get the latest undo log record for rollback.
+@param[in,out] node rollback context
+@return undo block for the undo log record
+@retval nullptr if no undo log record was fetched */
+static buf_block_t* row_undo_rec_get(undo_node_t* node)
+{
+ trx_t* trx = node->trx;
+
+ if (trx->pages_undone) {
+ trx->pages_undone = 0;
+ trx_undo_try_truncate(*trx);
+ }
+
+ trx_undo_t* undo = NULL;
+ trx_undo_t* update = trx->rsegs.m_redo.undo;
+ trx_undo_t* temp = trx->rsegs.m_noredo.undo;
+ const undo_no_t limit = trx->roll_limit;
+ node->is_temp = false;
+
+ ut_ad(!update || !temp || update->empty() || temp->empty()
+ || update->top_undo_no != temp->top_undo_no);
+
+ if (update && !update->empty() && update->top_undo_no >= limit) {
+ if (!undo) {
+ undo = update;
+ } else if (undo->top_undo_no < update->top_undo_no) {
+ undo = update;
+ }
+ }
+
+ if (temp && !temp->empty() && temp->top_undo_no >= limit) {
+ if (!undo || undo->top_undo_no < temp->top_undo_no) {
+ undo = temp;
+ node->is_temp = true;
+ }
+ }
+
+ if (undo == NULL) {
+ trx_undo_try_truncate(*trx);
+ /* Mark any ROLLBACK TO SAVEPOINT completed, so that
+ if the transaction object is committed and reused
+ later, we will default to a full ROLLBACK. */
+ trx->roll_limit = 0;
+ trx->in_rollback = false;
+ return nullptr;
+ }
+
+ ut_ad(!undo->empty());
+ ut_ad(limit <= undo->top_undo_no);
+
+ node->roll_ptr = trx_undo_build_roll_ptr(
+ false, trx_sys.rseg_id(undo->rseg, !node->is_temp),
+ undo->top_page_no, undo->top_offset);
+
+ mtr_t mtr;
+ mtr.start();
+
+ buf_block_t* undo_page = buf_page_get(
+ page_id_t(undo->rseg->space->id, undo->top_page_no),
+ 0, RW_S_LATCH, &mtr);
+ if (!undo_page) {
+ return nullptr;
+ }
+
+ uint16_t offset = undo->top_offset;
+
+ buf_block_t* prev_page = undo_page;
+ if (trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec(
+ prev_page, offset, undo->hdr_page_no, undo->hdr_offset,
+ true, &mtr)) {
+ if (prev_page != undo_page) {
+ trx->pages_undone++;
+ }
+
+ undo->top_page_no = prev_page->page.id().page_no();
+ undo->top_offset = page_offset(prev_rec);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ undo_page->fix();
+ mtr.commit();
+
+ node->undo_rec = undo_page->page.frame + offset;
+
+ const size_t end = mach_read_from_2(node->undo_rec);
+ if (UNIV_UNLIKELY(end <= offset
+ || end >= srv_page_size - FIL_PAGE_DATA_END)) {
+ undo_page->unfix();
+ node->undo_rec = nullptr;
+ return nullptr;
+ }
+
+ switch (node->undo_rec[2] & (TRX_UNDO_CMPL_INFO_MULT - 1)) {
+ case TRX_UNDO_INSERT_METADATA:
+ /* This record type was introduced in MDEV-11369
+ instant ADD COLUMN, which was implemented after
+ MDEV-12288 removed the insert_undo log. There is no
+ instant ADD COLUMN for temporary tables. Therefore,
+ this record can only be present in the main undo log. */
+ /* fall through */
+ case TRX_UNDO_RENAME_TABLE:
+ ut_ad(undo == update);
+ /* fall through */
+ case TRX_UNDO_INSERT_REC:
+ case TRX_UNDO_EMPTY:
+ node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
+ }
+
+ trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no(
+ node->undo_rec);
+ return undo_page;
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+row_undo(
+/*=====*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(node->trx->in_rollback);
+
+ buf_block_t* undo_page = row_undo_rec_get(node);
+
+ if (!undo_page) {
+ /* Rollback completed for this query thread */
+ thr->run_node = que_node_get_parent(node);
+ return DB_SUCCESS;
+ }
+
+ dberr_t err = trx_undo_roll_ptr_is_insert(node->roll_ptr)
+ ? row_undo_ins(node, thr) : row_undo_mod(node, thr);
+ undo_page->unfix();
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ undo_node_t* node;
+ trx_t* trx = thr_get_trx(thr);
+
+ node = static_cast<undo_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ if (UNIV_UNLIKELY(!trx->dict_operation
+ && !srv_undo_sources
+ && srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
+ /* Shutdown has been initiated. */
+ trx->error_state = DB_INTERRUPTED;
+ return NULL;
+ }
+
+ if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) {
+ trx_roll_report_progress();
+ }
+
+ err = row_undo(node, thr);
+
+#ifdef ENABLED_DEBUG_SYNC
+ if (trx->mysql_thd) {
+ DEBUG_SYNC_C("trx_after_rollback_row");
+ }
+#endif /* ENABLED_DEBUG_SYNC */
+
+ trx->error_state = err;
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ ib::fatal() << "Error (" << err << ") in rollback.";
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
new file mode 100644
index 00000000..bec53841
--- /dev/null
+++ b/storage/innobase/row/row0upd.cc
@@ -0,0 +1,3002 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.cc
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0log.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+#include "trx0rec.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include <algorithm>
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
+#ifdef WITH_WSREP
+#include "log.h"
+#include "wsrep.h"
+#endif /* WITH_WSREP */
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
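+
+/* An illustrative sketch (not part of the original source) of the logical
+delete described above, in hypothetical pseudocode. write_undo_log() and
+set_trx_id_and_roll_ptr() are placeholder names; the actual delete marking
+uses btr_rec_set_deleted<>() and the undo logging in trx0rec.cc:
+
+	// logical delete: no physical change to the index tree structure
+	write_undo_log(old_trx_id, old_roll_ptr, old_values);
+	btr_rec_set_deleted<true>(block, rec, &mtr);
+	set_trx_id_and_roll_ptr(rec, deleting_trx->id, new_roll_ptr);
+
+The record is physically removed from the index only later, by purge. */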
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
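+
+/* A minimal illustrative sketch (not part of the original source) of the
+rule above, assuming a caller that is about to start a mini-transaction
+which will generate redo; do_btree_modification() is a hypothetical
+placeholder:
+
+	log_free_check();	// while holding no latches or other
+				// synchronization objects
+	mtr_t	mtr;
+	mtr.start();
+	do_btree_modification(&mtr);
+	mtr.commit();
+*/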
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: old value of index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n); /*!< in: how many first fields to check */
+
+/*********************************************************************//**
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint.
+
+@return true if referenced */
+static
+bool
+row_upd_index_is_referenced(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t *table= index->table;
+ /* The pointers in table->referenced_set are safe to dereference
+ thanks to the SQL layer having acquired MDL on all (grand)parent tables. */
+ dict_foreign_set::iterator end= table->referenced_set.end();
+ return end != std::find_if(table->referenced_set.begin(), end,
+ dict_foreign_with_index(index));
+}
+
+#ifdef WITH_WSREP
+static
+bool
+wsrep_row_upd_index_is_foreign(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ if (!trx->is_wsrep())
+ return false;
+
+ dict_table_t *table= index->table;
+
+ if (table->foreign_set.empty())
+ return false;
+
+ /* No MDL protects dereferencing the members of table->foreign_set. */
+ const bool no_lock= !trx->dict_operation_lock_mode;
+ if (no_lock)
+ dict_sys.freeze(SRW_LOCK_CALL);
+
+ auto end= table->foreign_set.end();
+ const bool is_referenced= end !=
+ std::find_if(table->foreign_set.begin(), end,
+ [index](const dict_foreign_t* f)
+ {return f->foreign_index == index;});
+ if (no_lock)
+ dict_sys.unfreeze();
+
+ return is_referenced;
+}
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return DB_SUCCESS or an error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_check_references_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ const rec_t* rec;
+ dberr_t err;
+
+ DBUG_ENTER("row_upd_check_references_constraints");
+
+ if (table->referenced_set.empty()) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+
+ mtr_commit(mtr);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update");
+
+ mtr->start();
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "foreign_constraint_check_for_insert");
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+ dict_table_t* ref_table = nullptr;
+
+ if (!foreign->foreign_table) {
+ ref_table = dict_table_open_on_name(
+ foreign->foreign_table_name_lookup,
+ false, DICT_ERR_IGNORE_NONE);
+ }
+
+ err = row_ins_check_foreign_constraint(
+ FALSE, foreign, table, entry, thr);
+
+ if (ref_table) {
+ dict_table_close(ref_table);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+
+func_exit:
+ mem_heap_free(heap);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update_done");
+ DBUG_RETURN(err);
+}
+
+#ifdef WITH_WSREP
+static
+dberr_t
+wsrep_row_upd_check_foreign_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ const rec_t* rec;
+ dberr_t err;
+
+ if (table->foreign_set.empty()) {
+ return(DB_SUCCESS);
+ }
+
+ /* TODO: make native slave thread bail out here */
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->foreign_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+
+ dict_table_t *opened = nullptr;
+
+ if (!foreign->referenced_table) {
+ foreign->referenced_table =
+ dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ false, DICT_ERR_IGNORE_NONE);
+ opened = foreign->referenced_table;
+ }
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ if (opened) {
+ dict_table_close(opened);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/** Determine if a FOREIGN KEY constraint needs to be processed.
+@param[in] node query node
+@param[in] trx transaction
+@return whether the node cannot be ignored */
+
+inline bool wsrep_must_process_fk(const upd_node_t* node, const trx_t* trx)
+{
+ if (!trx->is_wsrep()) {
+ return false;
+ }
+ return que_node_get_type(node->common.parent) != QUE_NODE_UPDATE
+ || static_cast<upd_node_t*>(node->common.parent)->cascade_node
+ != node;
+}
+#endif /* WITH_WSREP */
+
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = static_cast<upd_node_t*>(
+ mem_heap_zalloc(heap, sizeof(upd_node_t)));
+
+ node->common.type = QUE_NODE_UPDATE;
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ ut_ad(!index->table->skip_alter_undo);
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+		/* We should ignore a virtual field if the index is not
+ a virtual index */
+ if (upd_fld_is_virtual_col(upd_field)
+ && !index->has_virtual()) {
+ continue;
+ }
+
+ new_val = &(upd_field->new_val);
+ if (dfield_is_ext(new_val)) {
+ return(TRUE);
+ }
+ new_len = dfield_get_len(new_val);
+ ut_ad(new_len != UNIV_SQL_DEFAULT);
+
+ if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+ new_len = dict_col_get_sql_null_size(
+ dict_index_get_nth_col(index,
+ upd_field->field_no),
+ 0);
+ }
+
+ if (rec_offs_nth_default(offsets, upd_field->field_no)) {
+ /* This is an instantly added column that is
+ at the initial default value. */
+ return(TRUE);
+ }
+
+ if (rec_offs_comp(offsets)
+ && rec_offs_nth_sql_null(offsets, upd_field->field_no)) {
+ /* Note that in the compact table format, for a
+ variable length field, an SQL NULL will use zero
+ bytes in the offset array at the start of the physical
+ record, but a zero-length value (empty string) will
+ use one byte! Thus, we cannot use update-in-place
+ if we update an SQL NULL varchar to an empty string! */
+
+ old_len = UNIV_SQL_NULL;
+ } else {
+ old_len = rec_offs_nth_size(offsets,
+ upd_field->field_no);
+ }
+
+ if (old_len != new_len
+ || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+
+ /* This function is used only for a secondary index */
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ut_ad(!rec_offs_any_default(offsets));
+ ut_ad(!index->table->skip_alter_undo);
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+
+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@param[in] index clustered index
+@param[in] entry clustered index entry to insert
+@param[in] rec clustered index record
+@param[in] offsets rec_get_offsets(rec,index), or NULL
+@param[in] no_sys skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR
+@param[in] trx transaction (for diagnostics),
+ or NULL
+@param[in] heap memory heap from which allocated
+@param[in] mysql_table NULL, or mysql table object when
+ user thread invokes dml
+@param[out] error error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id; if error is not equal to DB_SUCCESS, return NULL */
+upd_t*
+row_upd_build_difference_binary(
+ dict_index_t* index,
+ const dtuple_t* entry,
+ const rec_t* rec,
+ const rec_offs* offsets,
+ bool no_sys,
+ bool ignore_warnings,
+ trx_t* trx,
+ mem_heap_t* heap,
+ TABLE* mysql_table,
+ dberr_t* error)
+{
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint n_v_fld = dtuple_get_n_v_fields(entry);
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a clustered index */
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(entry->n_fields <= index->n_fields);
+ ut_ad(entry->n_fields >= index->n_core_fields);
+
+ update = upd_create(index->n_fields + n_v_fld, heap);
+
+ n_diff = 0;
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ for (uint16_t i = 0; i < entry->n_fields; i++) {
+ const byte* data = rec_get_nth_cfield(rec, index, offsets, i,
+ &len);
+ const dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+ if (no_sys && (i == index->db_trx_id()
+ || i == index->db_roll_ptr())) {
+ continue;
+ }
+
+ if (!dfield_is_ext(dfield)
+ != !rec_offs_nth_extern(offsets, i)
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+ upd_field_t* uf = upd_get_nth_field(update, n_diff++);
+ dfield_copy(&uf->new_val, dfield);
+ upd_field_set_field_no(uf, i, index);
+ }
+ }
+
+ for (uint16_t i = static_cast<uint16_t>(entry->n_fields);
+ i < index->n_fields; i++) {
+ upd_field_t* uf = upd_get_nth_field(update, n_diff++);
+ const dict_col_t* col = dict_index_get_nth_col(index, i);
+ /* upd_create() zero-initialized uf */
+ uf->new_val.data = const_cast<byte*>(col->instant_value(&len));
+ uf->new_val.len = static_cast<unsigned>(len);
+ dict_col_copy_type(col, &uf->new_val.type);
+ upd_field_set_field_no(uf, i, index);
+ }
+
+ /* Check the virtual columns updates. Even if there is no non-virtual
+ column (base columns) change, we will still need to build the
+	indexed virtual column value so that the undo log would log them
+	(for purge/MVCC purposes) */
+ if (n_v_fld > 0) {
+ row_ext_t* ext;
+ THD* thd;
+
+ if (trx == NULL) {
+ thd = current_thd;
+ } else {
+ thd = trx->mysql_thd;
+ }
+
+ ut_ad(!update->old_vrow);
+
+ ib_vcol_row vc(NULL);
+ uchar *record = vc.record(thd, index, &mysql_table);
+
+ for (uint16_t i = 0; i < n_v_fld; i++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(index->table, i);
+
+ if (!col->m_col.ord_part) {
+ continue;
+ }
+
+ if (update->old_vrow == NULL) {
+ update->old_vrow = row_build(
+ ROW_COPY_POINTERS, index, rec, offsets,
+ index->table, NULL, NULL, &ext, heap);
+ }
+
+ dfield_t* vfield = innobase_get_computed_value(
+ update->old_vrow, col, index,
+ &vc.heap, heap, NULL, thd, mysql_table, record,
+ NULL, NULL, ignore_warnings);
+ if (vfield == NULL) {
+ *error = DB_COMPUTE_VALUE_FAILED;
+ return(NULL);
+ }
+
+ const dfield_t* dfield = dtuple_get_nth_v_field(
+ entry, i);
+
+ if (!dfield_data_is_binary_equal(
+ dfield, vfield->len,
+ static_cast<byte*>(vfield->data))) {
+ upd_field_t* uf = upd_get_nth_field(update,
+ n_diff++);
+ uf->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(heap,
+ sizeof *uf->old_v_val));
+ dfield_copy(uf->old_v_val, vfield);
+ dfield_copy(&uf->new_val, dfield);
+ upd_field_set_v_field_no(uf, i, index);
+ }
+ }
+ }
+
+ update->n_fields = n_diff;
+ ut_ad(update->validate());
+
+ return(update);
+}
+
+/** Fetch a prefix of an externally stored column.
+This is similar to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+@param[in] data 'internally' stored part of the field
+containing also the reference to the external part
+@param[in] local_len length of data, in bytes
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] len input - length of prefix to
+fetch; output: fetched length of the prefix
+@param[in,out] heap heap where to allocate
+@return BLOB prefix
+@retval NULL if the record is incomplete (should only happen
+in row_vers_vc_matches_cluster() executed concurrently with another purge) */
+static
+byte*
+row_upd_ext_fetch(
+ const byte* data,
+ ulint local_len,
+ ulint zip_size,
+ ulint* len,
+ mem_heap_t* heap)
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len));
+
+ *len = btr_copy_externally_stored_field_prefix(
+ buf, *len, zip_size, data, local_len);
+
+ return *len ? buf : NULL;
+}
+
+/** Replaces the new column value stored in the update vector in
+the given index entry field.
+@param[in,out] dfield data field of the index entry
+@param[in] field index field
+@param[in] col field->col
+@param[in] uf update field
+@param[in,out] heap memory heap for allocating and copying
+the new value
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return whether the previous version was built successfully */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+bool
+row_upd_index_replace_new_col_val(
+ dfield_t* dfield,
+ const dict_field_t* field,
+ const dict_col_t* col,
+ const upd_field_t* uf,
+ mem_heap_t* heap,
+ ulint zip_size)
+{
+ ulint len;
+ const byte* data;
+
+ dfield_copy_data(dfield, &uf->new_val);
+
+ if (dfield_is_null(dfield)) {
+ return true;
+ }
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (field->prefix_len > 0) {
+ ibool fetch_ext = dfield_is_ext(dfield)
+ && len < (ulint) field->prefix_len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (fetch_ext) {
+ ulint l = len;
+
+ len = field->prefix_len;
+
+ data = row_upd_ext_fetch(data, l, zip_size,
+ &len, heap);
+ if (UNIV_UNLIKELY(!data)) {
+ return false;
+ }
+ }
+
+ len = dtype_get_at_most_n_mbchars(col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ field->prefix_len, len,
+ (const char*) data);
+
+ dfield_set_data(dfield, data, len);
+
+ if (!fetch_ext) {
+ dfield_dup(dfield, heap);
+ }
+
+ return true;
+ }
+
+ switch (uf->orig_len) {
+ byte* buf;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(dfield,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(dfield);
+ /* fall through */
+ case 0:
+ dfield_dup(dfield, heap);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len));
+
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ unsigned(uf->orig_len) - BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, buf, uf->orig_len);
+ dfield_set_ext(dfield);
+ break;
+ }
+
+ return true;
+}
+
+/** Apply an update vector to a metadata entry.
+@param[in,out] entry clustered index metadata record to be updated
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+static
+void
+row_upd_index_replace_metadata(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+{
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(update->is_alter_metadata());
+ ut_ad(entry->info_bits == update->info_bits);
+ ut_ad(entry->n_fields == ulint(index->n_fields) + 1);
+ const ulint zip_size = index->table->space->zip_size();
+ const ulint first = index->first_user_field();
+ ut_d(bool found_mblob = false);
+
+ for (ulint i = upd_get_n_fields(update); i--; ) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ ut_ad(!upd_fld_is_virtual_col(uf));
+ ut_ad(uf->field_no >= first - 2);
+ ulint f = uf->field_no;
+ dfield_t* dfield = dtuple_get_nth_field(entry, f);
+
+ if (f == first) {
+ ut_d(found_mblob = true);
+ ut_ad(!dfield_is_null(&uf->new_val));
+ ut_ad(dfield_is_ext(dfield));
+ ut_ad(dfield_get_len(dfield) == FIELD_REF_SIZE);
+ ut_ad(!dfield_is_null(dfield));
+ dfield_set_data(dfield, uf->new_val.data,
+ uf->new_val.len);
+ if (dfield_is_ext(&uf->new_val)) {
+ dfield_set_ext(dfield);
+ }
+ continue;
+ }
+
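+		/* Skip over the hidden metadata BLOB column: entry
+		fields after it correspond to index field f - 1. */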
+ f -= f > first;
+ const dict_field_t* field = dict_index_get_nth_field(index, f);
+ if (!row_upd_index_replace_new_col_val(dfield, field,
+ field->col,
+ uf, heap, zip_size)) {
+ ut_error;
+ }
+ }
+
+ ut_ad(found_mblob);
+}
+
+/** Apply an update vector to an index entry.
+@param[in,out] entry index entry to be updated; the clustered index record
+ must be covered by a lock or a page latch to prevent
+ deletion (rollback or purge)
+@param[in] index index of the entry
+@param[in] update update vector built for the entry
+@param[in,out] heap memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update,
+ mem_heap_t* heap)
+{
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(!entry->is_metadata() || entry->info_bits == update->info_bits);
+
+ if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+ row_upd_index_replace_metadata(entry, index, update, heap);
+ return;
+ }
+
+ const ulint zip_size = index->table->space->zip_size();
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (uint16_t i = index->n_fields; i--; ) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ if (col->is_virtual()) {
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(
+ col);
+
+ uf = upd_get_field_by_field_no(
+ update, vcol->v_pos, true);
+ } else {
+ uf = upd_get_field_by_field_no(
+ update, i, false);
+ }
+
+ if (uf && UNIV_UNLIKELY(!row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap,
+ zip_size))) {
+ ut_error;
+ }
+ }
+}
+
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry clustered index tuple where the values are replaced
+ (the clustered index leaf page latch must be held)
+@param index clustered index
+@param update update vector for the clustered index
+@param heap memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+ const upd_t *update, mem_heap_t *heap)
+{
+ ut_ad(index.is_primary());
+ const ulint zip_size= index.table->space->zip_size();
+
+ ut_ad(!index.table->skip_alter_undo);
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (ulint i= 0; i < index.n_fields; i++)
+ {
+ const dict_field_t *field= &index.fields[i];
+ const dict_col_t* col= dict_field_get_col(field);
+ const upd_field_t *uf;
+
+ if (col->is_virtual())
+ {
+ const dict_v_col_t *vcol= reinterpret_cast<const dict_v_col_t*>(col);
+ uf= upd_get_field_by_field_no(update, vcol->v_pos, true);
+ }
+ else
+ uf= upd_get_field_by_field_no(update, static_cast<uint16_t>
+ (dict_col_get_clust_pos(col, &index)),
+ false);
+
+ if (!uf)
+ continue;
+
+ if (!row_upd_index_replace_new_col_val(dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size))
+ return false;
+ }
+
+ return true;
+}
+
+/** Replaces the virtual column values stored in the update vector.
+@param[in,out] row row whose column to be set
+@param[in] field data to set
+@param[in] len data length
+@param[in] vcol virtual column info */
+static
+void
+row_upd_set_vcol_data(
+ dtuple_t* row,
+ const byte* field,
+ ulint len,
+ dict_v_col_t* vcol)
+{
+ dfield_t* dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
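+	/* Only fill in the value if the virtual column has not been
+	set yet, that is, its type is still the initial DATA_MISSING. */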
+ if (dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(&vcol->m_col, dfield_get_type(dfield));
+
+ dfield_set_data(dfield, field, len);
+ }
+}
+
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out] row row whose column to be updated
+@param[in] table table
+@param[in] update an update vector built for the clustered index
+@param[in] upd_new update to new or old value
+@param[in,out] undo_row undo row (if needs to be updated)
+@param[in] ptr remaining part in update undo log */
+void
+row_upd_replace_vcol(
+ dtuple_t* row,
+ const dict_table_t* table,
+ const upd_t* update,
+ bool upd_new,
+ dtuple_t* undo_row,
+ const byte* ptr)
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+
+ ut_ad(!table->skip_alter_undo);
+
+ n_cols = dtuple_get_n_v_fields(row);
+ for (col_no = 0; col_no < n_cols; col_no++) {
+ dfield_t* dfield;
+
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+		/* If there is no index on the column, do not bother
+		updating its value */
+ if (!col->m_col.ord_part) {
+ continue;
+ }
+
+ dfield = dtuple_get_nth_v_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+ if (!upd_fld_is_virtual_col(upd_field)
+ || upd_field->field_no != col->v_pos) {
+ continue;
+ }
+
+ if (upd_new) {
+ dfield_copy_data(dfield, &upd_field->new_val);
+ } else {
+ dfield_copy_data(dfield, upd_field->old_v_val);
+ }
+
+ dfield->type = upd_field->new_val.type;
+ break;
+ }
+ }
+
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+	/* Read in the unchanged (but indexed) virtual columns */
+ if (ptr) {
+ const byte* const end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(&ptr);
+
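+			/* Field numbers at or above REC_MAX_N_FIELDS
+			denote virtual columns in the undo log;
+			trx_undo_read_v_idx() resolves them to the
+			table's virtual column position. */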
+ const bool is_v = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_v) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+ if (field_no == FIL_NULL) {
+ ut_ad(is_v);
+ continue;
+ }
+
+ if (is_v) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ row_upd_set_vcol_data(row, field, len, vcol);
+
+ if (undo_row) {
+ row_upd_set_vcol_data(
+ undo_row, field, len, vcol);
+ }
+ }
+			ut_ad(ptr <= end_ptr);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+ ulint n_ext_cols;
+ ulint* ext_cols;
+ const dict_table_t* table;
+
+ ut_ad(row);
+ ut_ad(ext);
+ ut_ad(index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(update);
+ ut_ad(heap);
+ ut_ad(update->validate());
+
+ n_cols = dtuple_get_n_fields(row);
+ table = index->table;
+ ut_ad(n_cols == dict_table_get_n_cols(table));
+
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_cols * sizeof *ext_cols));
+
+ n_ext_cols = 0;
+
+ dtuple_set_info_bits(row, update->info_bits);
+
+ for (col_no = 0; col_no < n_cols; col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+ const ulint clust_pos
+ = dict_col_get_clust_pos(col, index);
+ dfield_t* dfield;
+
+ if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no != clust_pos
+ || upd_fld_is_virtual_col(upd_field)) {
+
+ continue;
+ }
+
+ dfield_copy_data(dfield, &upd_field->new_val);
+ break;
+ }
+
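+		/* Remember the externally stored columns that are
+		needed for building secondary index entries (ord_part),
+		so that their prefixes can be cached in a row_ext_t
+		below. */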
+ if (dfield_is_ext(dfield) && col->ord_part) {
+ ext_cols[n_ext_cols++] = col_no;
+ }
+ }
+
+ if (n_ext_cols) {
+ *ext = row_ext_create(n_ext_cols, ext_cols, *table, row, heap);
+ } else {
+ *ext = NULL;
+ }
+
+ row_upd_replace_vcol(row, table, update, true, nullptr, nullptr);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext, /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ ulint flag) /*!< in: ROW_BUILD_NORMAL,
+ ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */
+{
+ ulint n_unique;
+ ulint i;
+ const dict_index_t* clust_index;
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ n_unique = dict_index_get_n_unique(index);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n_unique; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_no;
+ const upd_field_t* upd_field;
+ const dfield_t* dfield;
+ dfield_t dfield_ext;
+ ulint dfield_len= 0;
+ const byte* buf;
+ bool is_virtual;
+ const dict_v_col_t* vcol = NULL;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_no = dict_col_get_no(col);
+ is_virtual = col->is_virtual();
+
+ if (is_virtual) {
+ vcol = reinterpret_cast<const dict_v_col_t*>(col);
+
+ upd_field = upd_get_field_by_field_no(
+ update, vcol->v_pos, true);
+ } else {
+ upd_field = upd_get_field_by_field_no(
+ update, static_cast<uint16_t>(
+ dict_col_get_clust_pos(
+ col, clust_index)),
+ false);
+ }
+
+ if (upd_field == NULL) {
+ continue;
+ }
+
+ if (row == NULL) {
+ ut_ad(ext == NULL);
+ return(TRUE);
+ }
+
+ if (is_virtual) {
+ dfield = dtuple_get_nth_v_field(
+ row, vcol->v_pos);
+ } else {
+ dfield = dtuple_get_nth_field(row, col_no);
+ }
+
+		/* For a spatial index update, different geometry
+		data can produce the same MBR. If the new index
+		entry is the same as the old entry, the MBR is
+		unchanged and nothing needs to be done. */
+ if (dict_index_is_spatial(index) && i == 0) {
+ double mbr1[SPDIMS * 2];
+ double mbr2[SPDIMS * 2];
+ rtr_mbr_t* old_mbr;
+ rtr_mbr_t* new_mbr;
+ const uchar* dptr = NULL;
+ ulint flen = 0;
+ ulint dlen = 0;
+ mem_heap_t* temp_heap = NULL;
+ const dfield_t* new_field = &upd_field->new_val;
+
+ const ulint zip_size = ext
+ ? ext->zip_size
+ : index->table->space->zip_size();
+
+ ut_ad(dfield->data != NULL
+ && dfield->len > GEO_DATA_HEADER_SIZE);
+ ut_ad(dict_col_get_spatial_status(col) != SPATIAL_NONE);
+
+ /* Get the old mbr. */
+ if (dfield_is_ext(dfield)) {
+ /* For off-page stored data, we
+ need to read the whole field data. */
+ flen = dfield_get_len(dfield);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ temp_heap = mem_heap_create(1000);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr,
+ zip_size,
+ flen,
+ temp_heap);
+ } else {
+ dptr = static_cast<const uchar*>(dfield->data);
+ dlen = dfield->len;
+ }
+
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr1);
+ old_mbr = reinterpret_cast<rtr_mbr_t*>(mbr1);
+
+ /* Get the new mbr. */
+ if (dfield_is_ext(new_field)) {
+ if (flag == ROW_BUILD_FOR_UNDO
+ && dict_table_has_atomic_blobs(
+ index->table)) {
+ /* For ROW_FORMAT=DYNAMIC
+ or COMPRESSED, a prefix of
+ off-page records is stored
+ in the undo log record
+ (for any column prefix indexes).
+ For SPATIAL INDEX, we must
+ ignore this prefix. The
+ full column value is stored in
+ the BLOB.
+ For non-spatial index, we
+ would have already fetched a
+ necessary prefix of the BLOB,
+ available in the "ext" parameter.
+
+ Here, for SPATIAL INDEX, we are
+ fetching the full column, which is
+ potentially wasting a lot of I/O,
+ memory, and possibly involving a
+ concurrency problem, similar to ones
+ that existed before the introduction
+ of row_ext_t.
+
+ MDEV-11657 FIXME: write the MBR
+ directly to the undo log record,
+ and avoid recomputing it here! */
+ flen = BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(dfield_get_len(new_field) >=
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(new_field))
+ + dfield_get_len(new_field)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ flen = dfield_get_len(new_field);
+ dptr = static_cast<const byte*>(
+ dfield_get_data(new_field));
+ }
+
+ if (temp_heap == NULL) {
+ temp_heap = mem_heap_create(1000);
+ }
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, dptr,
+ zip_size,
+ flen,
+ temp_heap);
+ } else {
+ dptr = static_cast<const byte*>(
+ upd_field->new_val.data);
+ dlen = upd_field->new_val.len;
+ }
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE),
+ SPDIMS, mbr2);
+ new_mbr = reinterpret_cast<rtr_mbr_t*>(mbr2);
+
+ if (temp_heap) {
+ mem_heap_free(temp_heap);
+ }
+
+ if (!MBR_EQUAL_CMP(old_mbr, new_mbr)) {
+ return(TRUE);
+ } else {
+ continue;
+ }
+ }
+
+ /* This treatment of column prefix indexes is loosely
+ based on row_build_index_entry(). */
+
+ if (UNIV_LIKELY(ind_field->prefix_len == 0)
+ || dfield_is_null(dfield)) {
+ /* do nothing special */
+ } else if (ext) {
+ /* Silence a compiler warning without
+ silencing a Valgrind error. */
+ dfield_len = 0;
+ MEM_UNDEFINED(&dfield_len, sizeof dfield_len);
+ /* See if the column is stored externally. */
+ buf = row_ext_lookup(ext, col_no, &dfield_len);
+
+ ut_ad(col->ord_part);
+
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ /* The externally stored field
+ was not written yet. This
+ record should only be seen by
+ trx_rollback_recovered()
+ when the server had crashed before
+ storing the field. */
+ ut_ad(!thr
+ || thr->graph->trx->is_recovered);
+ ut_ad(!thr
+ || thr->graph->trx
+ == trx_roll_crash_recv_trx);
+ return(TRUE);
+ }
+
+ goto copy_dfield;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ dfield_len = dfield_get_len(dfield);
+ ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(dict_index_is_clust(index)
+ || ind_field->prefix_len <= dfield_len);
+
+ buf= static_cast<const byte*>(dfield_get_data(dfield));
+copy_dfield:
+ ut_a(dfield_len > 0);
+ dfield_copy(&dfield_ext, dfield);
+ dfield_set_data(&dfield_ext, buf, dfield_len);
+ dfield = &dfield_ext;
+ }
+
+ if (!dfield_datas_are_binary_equal(
+ dfield, &upd_field->new_val,
+ ind_field->prefix_len)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update) /*!< in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (upd_fld_is_virtual_col(upd_field)) {
+ if (dict_table_get_nth_v_col(index->table,
+ upd_field->field_no)
+ ->m_col.ord_part) {
+ return(TRUE);
+ }
+ } else {
+ if (dict_field_get_col(dict_index_get_nth_field(
+ index, upd_field->field_no))->ord_part) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether the Doc ID column is changed */
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ ut_ad(!table->skip_alter_undo);
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+ return(col_no == fts->doc_col);
+}
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if an FTS indexed column is updated,
+else ULINT_UNDEFINED */
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ ut_ad(!table->skip_alter_undo);
+
+ if (upd_fld_is_virtual_col(upd_field)) {
+ col_no = upd_field->field_no;
+ return(dict_table_is_fts_column(fts->indexes, col_no, true));
+ } else {
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index,
+ upd_field->field_no);
+ return(dict_table_is_fts_column(fts->indexes, col_no, false));
+ }
+
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n) /*!< in: how many first fields to check */
+{
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+ ut_ad(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &upd_field->new_val, 0)) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /*!< in: record in a clustered index */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ const dict_index_t* index, /*!< in: index of rec */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ ut_ad(dict_index_is_clust(index));
+
+ const byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_cfield(
+ rec, index, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO], &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /*!< in/out: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/** Stores to the heap the virtual columns that are needed by any index
+@param[in,out] node row update node
+@param[in] update an update vector if it is update
+@param[in] thd mysql thread handle
+@param[in,out] mysql_table mysql table object
+@return true on success,
+	false if virtual column value computation fails. */
+static
+bool
+row_upd_store_v_row(
+ upd_node_t* node,
+ const upd_t* update,
+ THD* thd,
+ TABLE* mysql_table)
+{
+ dict_index_t* index = dict_table_get_first_index(node->table);
+ ib_vcol_row vc(NULL);
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(node->table);
+ col_no++) {
+
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(node->table, col_no);
+
+ if (col->m_col.ord_part) {
+ dfield_t* dfield
+ = dtuple_get_nth_v_field(node->row, col_no);
+ ulint n_upd
+ = update ? upd_get_n_fields(update) : 0;
+ ulint i = 0;
+
+ /* Check if the value is already in update vector */
+ for (i = 0; i < n_upd; i++) {
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+ if (!(upd_field->new_val.type.prtype
+ & DATA_VIRTUAL)
+ || upd_field->field_no != col->v_pos) {
+ continue;
+ }
+
+ dfield_copy_data(dfield, upd_field->old_v_val);
+ dfield_dup(dfield, node->heap);
+ break;
+ }
+
+ /* Not updated */
+ if (i >= n_upd) {
+ /* If this is an update, then the value
+ should be in update->old_vrow */
+ if (update) {
+ if (update->old_vrow == NULL) {
+					/* This only happens in a
+					cascade update, and the virtual
+					column cannot be affected, so it
+					is OK to set it to NULL */
+ dfield_set_null(dfield);
+ } else {
+ dfield_t* vfield
+ = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col_no);
+ dfield_copy_data(dfield, vfield);
+ dfield_dup(dfield, node->heap);
+ }
+ } else {
+ uchar *record = vc.record(thd, index,
+ &mysql_table);
+				/* Need to compute the value; this
+				happens when deleting a row */
+ dfield_t* vfield =
+ innobase_get_computed_value(
+ node->row, col, index,
+ &vc.heap, node->heap,
+ NULL, thd, mysql_table,
+ record, NULL, NULL);
+ if (vfield == NULL) {
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/** Stores to the heap the row on which the node->pcur is positioned.
+@param[in] node row update node
+@param[in] thd mysql thread handle
+@param[in,out]	mysql_table	NULL, or MySQL table object when
+			a user thread invokes DML
+@return false if virtual column value computation fails,
+	true otherwise. */
+static
+bool
+row_upd_store_row(
+ upd_node_t* node,
+ THD* thd,
+ TABLE* mysql_table)
+{
+ dict_index_t* clust_index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ row_ext_t** ext;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_table_has_atomic_blobs(node->table)) {
+ /* There is no prefix of externally stored columns in
+ the clustered index record. Build a cache of column
+ prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored column.
+ No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ NULL, NULL, NULL, ext, node->heap);
+
+ if (node->table->n_v_cols) {
+ bool ok = row_upd_store_v_row(node,
+ node->is_delete ? NULL : node->update,
+ thd, mysql_table);
+ if (!ok) {
+ return false;
+ }
+ }
+
+ if (node->is_delete == PLAIN_DELETE) {
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ } else {
+ node->upd_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->upd_row, &node->upd_ext,
+ clust_index, node->update, node->heap);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return true;
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_index_entry(
+/*====================*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ btr_latch_mode mode;
+ ulint flags;
+ enum row_search_result search_result;
+
+ ut_ad(trx->id != 0);
+
+ index = node->index;
+ ut_ad(index->is_committed());
+
+ /* For secondary indexes, index->online_status==ONLINE_INDEX_COMPLETE
+ if index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ const bool referenced = row_upd_index_is_referenced(index, trx);
+#ifdef WITH_WSREP
+ const bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+#endif /* WITH_WSREP */
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, node->ext, index, heap);
+ ut_a(entry);
+
+ log_free_check();
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+ "before_row_upd_sec_index_entry");
+
+ mtr.start();
+ mode = BTR_MODIFY_LEAF;
+
+ switch (index->table->space_id) {
+ case SRV_TMP_SPACE_ID:
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ flags = BTR_NO_LOCKING_FLAG;
+ break;
+ default:
+ index->set_modified(mtr);
+ /* fall through */
+ case IBUF_SPACE_ID:
+ flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index. */
+ if (!referenced) {
+ mode = BTR_DELETE_MARK_LEAF;
+ }
+ break;
+ }
+
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = thr;
+ pcur.btr_cur.page_cur.index = index;
+
+ if (index->is_spatial()) {
+ mode = btr_latch_mode(BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK);
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ goto found;
+ }
+
+ if (pcur.btr_cur.rtr_info->fd_del) {
+			/* We found the record, but it is delete-marked */
+ goto close;
+ }
+
+ goto not_found;
+ }
+
+ search_result = row_search_index_entry(entry, mode, &pcur, &mtr);
+
+ switch (search_result) {
+ const rec_t* rec;
+ case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
+ ut_error;
+ break;
+ case ROW_BUFFERED:
+ /* Entry was delete marked already. */
+ break;
+
+ case ROW_NOT_FOUND:
+not_found:
+ rec = btr_pcur_get_rec(&pcur);
+ ib::error()
+ << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on update: " << *entry
+ << " at: " << rec_index_print(rec, index);
+#ifdef UNIV_DEBUG
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ ut_ad(btr_validate_index(index, 0) == DB_SUCCESS);
+ ut_ad(0);
+#endif /* UNIV_DEBUG */
+ break;
+ case ROW_FOUND:
+found:
+ ut_ad(err == DB_SUCCESS);
+ rec = btr_pcur_get_rec(&pcur);
+
+		/* Delete-mark the old index record; it may already be
+		delete-marked if we returned after a lock wait in
+		row_ins_sec_index_entry() below */
+ if (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table))) {
+ err = lock_sec_rec_modify_check_and_lock(
+ flags,
+ btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), index, thr, &mtr);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ btr_rec_set_deleted<true>(btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur),
+ &mtr);
+#ifdef WITH_WSREP
+ if (!referenced && foreign
+ && wsrep_must_process_fk(node, trx)
+ && !wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, NULL, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+ WSREP_DEBUG("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ default:
+ WSREP_ERROR("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ }
+ }
+#endif /* WITH_WSREP */
+ }
+
+#ifdef WITH_WSREP
+ ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT
+ || err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT);
+#else
+ ut_ad(err == DB_SUCCESS);
+#endif
+
+ if (referenced) {
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, NULL, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ }
+ }
+
+close:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete == PLAIN_DELETE || err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ mem_heap_empty(heap);
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+ "before_row_upd_sec_new_index_entry");
+
+ /* Build a new index entry */
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ /* Insert new index entry */
+ err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update or
+deletes it if this is a delete.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_step(
+/*=============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!dict_index_is_clust(node->index));
+
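+	/* The secondary index entry only needs to be updated if the
+	whole row is being replaced (UPD_NODE_UPDATE_ALL_SEC) or if
+	some ordering field of this index changed; otherwise the
+	existing entry remains valid. */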
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, node->row, node->ext)) {
+ return(row_upd_sec_index_entry(node, thr));
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifdef UNIV_DEBUG
+# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(rec,index,offsets,entry,update)
+#else /* UNIV_DEBUG */
+# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(rec,entry,update)
+#endif /* UNIV_DEBUG */
+/*******************************************************************//**
+Mark non-updated off-page columns inherited when the primary key is
+updated. We must mark them as inherited in entry, so that they are not
+freed in a rollback. A limited version of this function used to be
+called btr_cur_mark_dtuple_inherited_extern().
+@return whether any columns were inherited */
+static
+bool
+row_upd_clust_rec_by_insert_inherit_func(
+/*=====================================*/
+ const rec_t* rec, /*!< in: old record, or NULL */
+#ifdef UNIV_DEBUG
+ dict_index_t* index, /*!< in: index, or NULL */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec), or NULL */
+#endif /* UNIV_DEBUG */
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted into the clustered index */
+ const upd_t* update) /*!< in: update vector */
+{
+ bool inherit = false;
+
+ ut_ad(!rec == !offsets);
+ ut_ad(!rec == !index);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec || rec_offs_any_extern(offsets));
+
+ for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) {
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+ byte* data;
+ ulint len;
+
+ ut_ad(!offsets
+ || !rec_offs_nth_extern(offsets, i)
+ == !dfield_is_ext(dfield)
+ || (!dict_index_get_nth_field(index, i)->name
+ && !dfield_is_ext(dfield)
+ && (dfield_is_null(dfield) || dfield->len == 0))
+ || upd_get_field_by_field_no(update, i, false));
+ if (!dfield_is_ext(dfield)
+ || upd_get_field_by_field_no(update, i, false)) {
+ continue;
+ }
+
+#ifdef UNIV_DEBUG
+ if (UNIV_LIKELY(rec != NULL)) {
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ const byte* rec_data
+ = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len == dfield_get_len(dfield));
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ rec_data += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* The pointer must not be zero. */
+ ut_ad(memcmp(rec_data, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* The BLOB must be owned. */
+ ut_ad(!(rec_data[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG */
+
+ len = dfield_get_len(dfield);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ data += len - BTR_EXTERN_FIELD_REF_SIZE;
+ /* The pointer must not be zero. */
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+		/* The BLOB must be owned, unless we are resuming from
+		a lock wait and we had already disowned the BLOB. */
+ ut_a(rec == NULL
+ || !(data[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ data[BTR_EXTERN_LEN] &= byte(~BTR_EXTERN_OWNER_FLAG);
+ data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG;
+ /* The BTR_EXTERN_INHERITED_FLAG only matters in
+ rollback of a fresh insert. Purge will always free
+ the extern fields of a delete-marked row. */
+
+ inherit = true;
+ }
+
+ return(inherit);
+}
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec_by_insert(
+/*========================*/
+ upd_node_t* node, /*!< in/out: row update node */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ que_thr_t* thr, /*!< in: query thread */
+ bool referenced,/*!< in: whether index may be referenced in
+ a foreign key constraint */
+#ifdef WITH_WSREP
+ bool foreign,/*!< in: whether this is a foreign key */
+#endif
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ may be committed and restarted */
+{
+ mem_heap_t* heap;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ dberr_t err;
+ rec_t* rec;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ ut_ad(dict_index_is_clust(index));
+
+ rec_offs_init(offsets_);
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ heap = mem_heap_create(1000);
+
+ entry = row_build_index_entry_low(node->upd_row, node->upd_ext,
+ index, heap, ROW_BUILD_FOR_INSERT);
+ if (index->is_instant()) entry->trim(*index);
+ ut_ad(dtuple_get_info_bits(entry) == 0);
+
+ {
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx->id);
+ }
+
+ switch (node->state) {
+ default:
+ ut_error;
+ case UPD_NODE_INSERT_CLUSTERED:
+ /* A lock wait occurred in row_ins_clust_index_entry() in
+ the previous invocation of this function. */
+ row_upd_clust_rec_by_insert_inherit(
+ NULL, NULL, NULL, entry, node->update);
+ break;
+ case UPD_NODE_UPDATE_CLUSTERED:
+ /* This is the first invocation of the function where
+ we update the primary key. Delete-mark the old record
+ in the clustered index and prepare to insert a new entry. */
+ rec = btr_cur_get_rec(btr_cur);
+ offsets = rec_get_offsets(rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+			/* If the clustered index record is already
+			delete-marked, then we are here after a
+			DB_LOCK_WAIT. Skip delete-marking the clustered
+			index record and disowning its BLOBs. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets)
+ == trx->id);
+ ut_ad(!trx_undo_roll_ptr_is_insert(
+ row_get_rec_roll_ptr(rec, index,
+ offsets)));
+ goto check_fk;
+ }
+
+ err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), rec, index, offsets,
+ thr, node->row, mtr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+		/* If the new row inherits externally stored
+ fields (off-page columns a.k.a. BLOBs) from the
+ delete-marked old record, mark them disowned by the
+ old record and owned by the new entry. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (row_upd_clust_rec_by_insert_inherit(
+ rec, index, offsets,
+ entry, node->update)) {
+ /* The blobs are disowned here, expecting the
+ insert down below to inherit them. But if the
+ insert fails, then this disown will be undone
+ when the operation is rolled back. */
+ btr_cur_disown_inherited_fields(
+ btr_cur_get_block(btr_cur),
+ rec, index, offsets, node->update,
+ mtr);
+ }
+ }
+check_fk:
+ if (referenced) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+#ifdef WITH_WSREP
+ } else if (foreign && wsrep_must_process_fk(node, trx)) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+ WSREP_DEBUG("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+
+ goto err_exit;
+ default:
+ WSREP_ERROR("Foreign key check fail: "
+ "%s on table %s index %s query %s",
+ ut_strerr(err), index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+
+ goto err_exit;
+ }
+#endif /* WITH_WSREP */
+ }
+ }
+
+ mtr->commit();
+ mtr->start();
+
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+ err = row_ins_clust_index_entry(index, entry, thr,
+ dtuple_get_n_ext(entry));
+err_exit:
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec(
+/*==============*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in: rec_get_offsets() on node->pcur */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap, can be emptied */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in,out: mini-transaction; may be
+ committed and restarted here */
+{
+ mem_heap_t* heap = NULL;
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!thr_get_trx(thr)->in_rollback);
+ ut_ad(!node->table->skip_alter_undo);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(btr_cur_get_index(btr_cur) == index);
+ ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table)));
+ ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks because we assume the x-lock on the
+ record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(
+ flags | BTR_NO_LOCKING_FLAG, btr_cur,
+ offsets, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ } else {
+ err = btr_cur_optimistic_update(
+ flags | BTR_NO_LOCKING_FLAG, btr_cur,
+ &offsets, offsets_heap, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ }
+
+ if (err == DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (buf_pool.running_out()) {
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr->commit();
+ mtr->start();
+
+ if (index->table->is_temporary()) {
+ /* Disable locking, because temporary tables are never
+ shared between transactions or connections. */
+ flags |= BTR_NO_LOCKING_FLAG;
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ index->set_modified(*mtr);
+ }
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(pcur->restore_position(BTR_MODIFY_TREE, mtr) ==
+ btr_pcur_t::SAME_ALL);
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ if (!heap) {
+ heap = mem_heap_create(1024);
+ }
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
+ &offsets, offsets_heap, heap, &big_rec,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+
+ DEBUG_SYNC_C("before_row_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ pcur, offsets, big_rec, mtr, BTR_STORE_UPDATE);
+ DEBUG_SYNC_C("after_row_upd_extern");
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks a clustered index record.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets() for the
+ record under the cursor */
+ que_thr_t* thr, /*!< in: query thread */
+ bool referenced,
+ /*!< in: whether index may be referenced in
+ a foreign key constraint */
+#ifdef WITH_WSREP
+ bool foreign,/*!< in: whether this is a foreign key */
+#endif
+ mtr_t* mtr) /*!< in,out: mini-transaction;
+ will be committed and restarted */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ rec_t* rec;
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(node->is_delete == PLAIN_DELETE);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	/* Store the row because we also have to build the secondary
+	index entries */
+
+ if (!row_upd_store_row(node, trx->mysql_thd,
+ thr->prebuilt && thr->prebuilt->table == node->table
+ ? thr->prebuilt->m_mysql_table : NULL)) {
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ dberr_t err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), rec,
+ index, offsets, thr, node->row, mtr);
+
+ if (err != DB_SUCCESS) {
+ } else if (referenced) {
+ /* NOTE that the following call loses the position of pcur ! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+#ifdef WITH_WSREP
+ } else if (foreign && wsrep_must_process_fk(node, trx)) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_LOCK_WAIT:
+ case DB_DEADLOCK:
+ case DB_LOCK_WAIT_TIMEOUT:
+ WSREP_DEBUG("Foreign key check fail: "
+ "%d on table %s index %s query %s",
+ err, index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ default:
+ WSREP_ERROR("Foreign key check fail: "
+ "%d on table %s index %s query %s",
+ err, index->name(), index->table->name.m_name,
+ wsrep_thd_query(trx->mysql_thd));
+ break;
+ }
+#endif /* WITH_WSREP */
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the clustered index record.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_step(
+/*===============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ dberr_t err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ ulint flags;
+ trx_t* trx = thr_get_trx(thr);
+
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ if (index->is_corrupted()) {
+ return DB_TABLE_CORRUPT;
+ }
+
+ const bool referenced = row_upd_index_is_referenced(index, trx);
+#ifdef WITH_WSREP
+ const bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+#endif
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+
+ mtr.start();
+
+ if (node->table->is_temporary()) {
+ /* Disable locking, because temporary tables are
+ private to the connection (no concurrent access). */
+ flags = node->table->no_rollback()
+ ? BTR_NO_ROLLBACK
+ : BTR_NO_LOCKING_FLAG;
+ /* Redo logging only matters for persistent tables. */
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ } else {
+ flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
+ index->set_modified(mtr);
+ }
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ btr_latch_mode mode;
+
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd, "innodb_row_upd_clust_step_enter");
+
+ if (dict_index_is_online_ddl(index)) {
+ ut_ad(node->table->id != DICT_INDEXES_ID);
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ mode = BTR_MODIFY_LEAF;
+ }
+
+ if (pcur->restore_position(mode, &mtr) != btr_pcur_t::SAME_ALL) {
+ err = DB_RECORD_NOT_FOUND;
+ goto exit_func;
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (!flags && !node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ goto exit_func;
+ }
+ }
+
+ ut_ad(index->table->no_rollback() || index->table->is_temporary()
+ || row_get_rec_trx_id(rec, index, offsets) == trx->id
+ || lock_trx_has_expl_x_lock(*trx, *index->table,
+ btr_pcur_get_block(pcur)->page.id(),
+ page_rec_get_heap_no(rec)));
+
+ if (node->is_delete == PLAIN_DELETE) {
+ err = row_upd_del_mark_clust_rec(
+ node, index, offsets, thr, referenced,
+#ifdef WITH_WSREP
+ foreign,
+#endif
+ &mtr);
+ goto all_done;
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets, index,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (!node->is_delete && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ err = row_upd_clust_rec(
+ flags, node, index, offsets, &heap, thr, &mtr);
+ goto exit_func;
+ }
+
+ if (!row_upd_store_row(node, trx->mysql_thd, thr->prebuilt
+ ? thr->prebuilt->m_mysql_table : NULL)) {
+ err = DB_COMPUTE_VALUE_FAILED;
+ goto exit_func;
+ }
+
+ if (row_upd_changes_ord_field_binary(index, node->update, thr,
+ node->row, node->ext)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+		TODO! What to do about the 'Halloween problem', where an
+		update moves the record forward in the index so that it is
+		updated again when the cursor arrives there? Solution: the
+		read operation must check the undo record undo number when
+		choosing records to update. MySQL currently solves the
+		problem externally! */
+
+ err = row_upd_clust_rec_by_insert(
+ node, index, thr, referenced,
+#ifdef WITH_WSREP
+ foreign,
+#endif
+ &mtr);
+all_done:
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+success:
+ node->index = dict_table_get_next_index(index);
+ }
+ } else {
+ err = row_upd_clust_rec(
+ flags, node, index, offsets, &heap, thr, &mtr);
+
+ if (err == DB_SUCCESS) {
+ ut_ad(node->is_delete != PLAIN_DELETE);
+ node->state = node->is_delete
+ ? UPD_NODE_UPDATE_ALL_SEC
+ : UPD_NODE_UPDATE_SOME_SEC;
+ goto success;
+ }
+ }
+
+exit_func:
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return err;
+}
+
+/***********************************************************//**
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and the position of the cursor is stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+dberr_t
+row_upd(
+/*====*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+ DBUG_ENTER("row_upd");
+
+ ut_ad(!thr_get_trx(thr)->in_rollback);
+
+ DBUG_PRINT("row_upd", ("table: %s", node->table->name.m_name));
+ DBUG_PRINT("row_upd", ("info bits in update vector: 0x%x",
+ node->update ? node->update->info_bits: 0));
+ DBUG_PRINT("row_upd", ("foreign_id: %s",
+ node->foreign ? node->foreign->id: "NULL"));
+
+ if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete == PLAIN_DELETE
+ || row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ switch (node->state) {
+ case UPD_NODE_UPDATE_CLUSTERED:
+ case UPD_NODE_INSERT_CLUSTERED:
+ log_free_check();
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_upd_clust");
+
+ if (node->index == NULL
+ || (!node->is_delete
+ && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
+
+ DBUG_RETURN(DB_SUCCESS);
+ }
+
+ DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;);
+
+ do {
+ if (!node->index) {
+ break;
+ }
+
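+		/* Skip corrupted indexes and indexes that are still
+		being built by online DDL. FTS indexes are not updated
+		here; they are presumably maintained separately by the
+		fulltext subsystem. */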
+ if (!(node->index->type & (DICT_FTS | DICT_CORRUPT))
+ && node->index->is_committed()) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ DBUG_RETURN(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ } while (node->index != NULL);
+
+ ut_ad(err == DB_SUCCESS);
+
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ DBUG_RETURN(err);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+ DBUG_ENTER("row_upd_step");
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(node->table, nullptr, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ DBUG_RETURN(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ DBUG_RETURN(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ DBUG_RETURN(thr);
+}
+
+/** Write query start time as SQL field data to a buffer. Needed by InnoDB.
+@param thd Thread object
+@param buf Buffer to hold start time data */
+void thd_get_query_start_data(THD *thd, char *buf);
+
+/** Appends the row_start or row_end field to the update vector and sets
+it to a CURRENT_TIMESTAMP or trx->id value. Called by vers_make_update()
+and vers_make_delete().
+@param[in] trx transaction
+@param[in] vers_sys_idx table->row_start or table->row_end */
+void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx)
+{
+ ut_ad(in_mysql_interface); // otherwise needs to recalculate node->cmpl_info
+ ut_ad(idx == table->vers_start || idx == table->vers_end);
+
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ const dict_col_t *col= dict_table_get_nth_col(table, idx);
+ ulint field_no= dict_col_get_clust_pos(col, clust_index);
+ upd_field_t *ufield;
+
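+ /* If the row_start/row_end column is already present in the update
+ vector, reuse that field; otherwise append a new one below. */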
+ for (ulint i= 0; i < update->n_fields; ++i)
+ {
+ if (update->fields[i].field_no == field_no)
+ {
+ ufield= &update->fields[i];
+ goto skip_append;
+ }
+ }
+
+ /* row_create_update_node_for_mysql() pre-allocated this much.
+ At least one PK column always remains unchanged. */
+ ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols));
+
+ update->n_fields++;
+ ufield= upd_get_nth_field(update, update->n_fields - 1);
+ upd_field_set_field_no(ufield, static_cast<uint16_t>(field_no), clust_index);
+
+skip_append:
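+ /* Write the new system field value: the transaction id for a
+ TRX_ID-based system column, otherwise the query start timestamp
+ obtained from the server layer. */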
+ char *where= reinterpret_cast<char *>(update->vers_sys_value);
+ if (col->vers_native())
+ mach_write_to_8(where, trx->id);
+ else
+ thd_get_query_start_data(trx->mysql_thd, where);
+
+ dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len);
+
+ for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++)
+ {
+ const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no);
+ if (!v_col->m_col.ord_part)
+ continue;
+ for (ulint i= 0; i < unsigned(v_col->num_base); i++)
+ {
+ dict_col_t *base_col= v_col->base_col[i];
+ if (base_col->ind == col->ind)
+ {
+ /* Virtual column depends on system field value
+ which we updated above. Remove it from update
+ vector, so it is recalculated in
+ row_upd_store_v_row() (see !update branch). */
+ update->remove(v_col->v_pos);
+ break;
+ }
+ }
+ }
+}
+
+
+/** Prepare update vector for versioned delete.
+Set row_end to CURRENT_TIMESTAMP or trx->id.
+Initialize fts_next_doc_id for versioned delete.
+@param[in] trx transaction */
+void upd_node_t::vers_make_delete(trx_t* trx)
+{
+ update->n_fields= 0;
+ is_delete= VERSIONED_DELETE;
+ vers_update_fields(trx, table->vers_end);
+ trx->fts_next_doc_id= table->fts ? UINT64_UNDEFINED : 0;
+}
diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc
new file mode 100644
index 00000000..c3acf325
--- /dev/null
+++ b/storage/innobase/row/row0vers.cc
@@ -0,0 +1,1419 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.cc
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+
+/** Check whether all non-virtual index fields are equal.
+@param[in] index the secondary index
+@param[in] a first index entry to compare
+@param[in] b second index entry to compare
+@return whether all non-virtual fields are equal */
+static
+bool
+row_vers_non_virtual_fields_equal(
+ const dict_index_t* index,
+ const dfield_t* a,
+ const dfield_t* b)
+{
+ const dict_field_t* end = &index->fields[index->n_fields];
+
+ for (const dict_field_t* ifield = index->fields; ifield != end;
+ ifield++) {
+ if (!ifield->col->is_virtual()
+ && cmp_dfield_dfield(a++, b++)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] clust_rec clustered index record
+@param[in] clust_index clustered index
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in,out] mtr mini-transaction
+@return the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+UNIV_INLINE
+trx_t*
+row_vers_impl_x_locked_low(
+ trx_t* caller_trx,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets,
+ mtr_t* mtr)
+{
+ trx_id_t trx_id;
+ rec_t* prev_version = NULL;
+ rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* clust_offsets;
+ mem_heap_t* heap;
+ dtuple_t* ientry = NULL;
+ mem_heap_t* v_heap = NULL;
+ dtuple_t* cur_vrow = NULL;
+
+ rec_offs_init(clust_offsets_);
+
+ DBUG_ENTER("row_vers_impl_x_locked_low");
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr->memo_contains_page_flagged(clust_rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
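+ /* If DB_TRX_ID is stored at a fixed byte offset in the
+ clustered index record, read it directly without computing
+ the record offsets first. */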
+ if (ulint trx_id_offset = clust_index->trx_id_offset) {
+ trx_id = mach_read_from_6(clust_rec + trx_id_offset);
+ if (trx_id == 0) {
+ /* The transaction history was already purged. */
+ DBUG_RETURN(0);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, clust_offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+ if (trx_id == 0) {
+ /* The transaction history was already purged. */
+ mem_heap_free(heap);
+ DBUG_RETURN(0);
+ }
+
+ ut_ad(!clust_index->table->is_temporary());
+
+ trx_t* trx;
+
+ if (trx_id == caller_trx->id) {
+ trx = caller_trx;
+ trx->reference();
+ } else {
+ trx = trx_sys.find(caller_trx, trx_id);
+ if (trx == 0) {
+ /* The transaction that modified or inserted
+ clust_rec is no longer active, or it is
+ corrupt: no implicit lock on rec */
+ lock_check_trx_id_sanity(trx_id, clust_rec,
+ clust_index, clust_offsets);
+ mem_heap_free(heap);
+ DBUG_RETURN(0);
+ }
+ }
+
+ const ulint comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ const ulint rec_del = rec_get_deleted_flag(rec, comp);
+
+ if (dict_index_has_virtual(index)) {
+ ulint est_size = DTUPLE_EST_ALLOC(index->n_fields);
+
+ /* Allocate the dtuple for virtual columns extracted from the
+ undo log with its own heap, so that it is not freed as we
+ iterate in the version loop below. */
+ v_heap = mem_heap_create(est_size);
+ ientry = row_rec_to_index_entry(rec, index, offsets, v_heap);
+ }
+
+ /* We look up if some earlier version, which was modified by
+ the trx_id transaction, of the clustered index record would
+ require rec to be in a different state (delete marked or
+ unmarked, or have different field values, or not existing). If
+ there is such a version, then rec was modified by the trx_id
+ transaction, and it has an implicit x-lock on rec. Note that
+ if clust_rec itself would require rec to be in a different
+ state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock
+ on rec. */
+
+ for (const rec_t* version = clust_rec;; version = prev_version) {
+ row_ext_t* ext;
+ dtuple_t* row;
+ dtuple_t* entry;
+ ulint vers_del;
+ trx_id_t prev_trx_id;
+ mem_heap_t* old_heap = heap;
+ dtuple_t* vrow = NULL;
+
+ /* We keep the semaphore in mtr on the clust_rec page, so
+ that no other transaction can update it and get an
+ implicit x-lock on rec until mtr_commit(mtr). */
+
+ heap = mem_heap_create(1024);
+
+ trx_undo_prev_version_build(
+ version, clust_index, clust_offsets,
+ heap, &prev_version, NULL,
+ dict_index_has_virtual(index) ? &vrow : NULL, 0);
+
+ ut_d(trx->mutex_lock());
+ const bool committed = trx_state_eq(
+ trx, TRX_STATE_COMMITTED_IN_MEMORY);
+ ut_d(trx->mutex_unlock());
+
+ /* The oldest visible clustered index version must not be
+ delete-marked, because we never start a transaction by
+ inserting a delete-marked record. */
+ ut_ad(committed || prev_version
+ || !rec_get_deleted_flag(version, comp));
+
+ /* Free version and clust_offsets. */
+ mem_heap_free(old_heap);
+
+ if (committed) {
+ goto not_locked;
+ }
+
+ if (prev_version == NULL) {
+
+ /* We reached the oldest visible version without
+ finding an older version of clust_rec that would
+ match the secondary index record. If the secondary
+ index record is not delete marked, then clust_rec
+ is considered the correct match of the secondary
+ index record and hence holds the implicit lock. */
+
+ if (rec_del) {
+ /* The secondary index record is del marked.
+ So, the implicit lock holder of clust_rec
+ did not modify the secondary index record yet,
+ and is not holding an implicit lock on it.
+
+ This assumes that whenever a row is inserted
+ or updated, the leaf page record always is
+ created with a clear delete-mark flag.
+ (We never insert a delete-marked record.) */
+not_locked:
+ trx->release_reference();
+ trx = 0;
+ }
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(
+ prev_version, clust_index, clust_offsets_,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ /* Keep the virtual row info for the next
+ version */
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Build index entry out of row */
+ entry = row_build_index_entry(row, ext, index,
+ heap);
+
+ /* entry could only be NULL (the
+ clustered index record could contain
+ BLOB pointers that are NULL) if we
+ were accessing a freshly inserted
+ record before it was fully inserted.
+ prev_version cannot possibly be such
+ an incomplete record, because its
+ transaction would have to be committed
+ in order for later versions of the
+ record to be able to exist. */
+ ut_ad(entry);
+
+ /* If the indexed virtual columns have changed,
+ there must be an undo log record to generate vrow.
+ Otherwise they are unchanged, so there is no need
+ to compare. */
+ if (!row_vers_non_virtual_fields_equal(
+ index,
+ ientry->fields, entry->fields)) {
+ if (rec_del != vers_del) {
+ break;
+ }
+ } else if (!rec_del) {
+ break;
+ }
+
+ goto result_check;
+ } else {
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* entry could only be NULL (the clustered index
+ record could contain BLOB pointers that are NULL) if
+ we were accessing a freshly inserted record before it
+ was fully inserted. prev_version cannot possibly be
+ such an incomplete record, because its transaction
+ would have to be committed in order for later versions
+ of the record to be able to exist. */
+ ut_ad(entry);
+
+ /* If we get here, we know that the trx_id transaction
+ modified prev_version. Let us check if prev_version
+ would require rec to be in a different state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because clust_rec was not a fresh insert.
+ There is no guarantee that the transaction is still
+ active. */
+
+ /* We check whether entry and rec are identical in the alphabetical
+ ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, index, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+
+ break;
+ }
+
+ /* It is possible that the row was updated so that the
+ secondary index record remained the same in
+ alphabetical ordering, but the field values changed
+ still. For example, 'abc' -> 'ABC'. Check also that. */
+
+ dtuple_set_types_binary(
+ entry, dtuple_get_n_fields(entry));
+
+ if (cmp_dtuple_rec(entry, rec, index, offsets)) {
+
+ break;
+ }
+
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ break;
+ }
+
+result_check:
+ if (trx->id != prev_trx_id) {
+ /* prev_version was the first version modified by
+ the trx_id transaction: no implicit x-lock */
+ goto not_locked;
+ }
+ }
+
+ if (trx) {
+ DBUG_PRINT("info", ("Implicit lock is held by trx:" TRX_ID_FMT,
+ trx_id));
+ }
+
+ if (v_heap != NULL) {
+ mem_heap_free(v_heap);
+ }
+
+ mem_heap_free(heap);
+ DBUG_RETURN(trx);
+}
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out] caller_trx trx of current thread
+@param[in] rec secondary index record
+@param[in] index secondary index
+@param[in] offsets rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+ trx_t* caller_trx,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets)
+{
+ mtr_t mtr;
+ trx_t* trx;
+ const rec_t* clust_rec;
+ dict_index_t* clust_index;
+
+ lock_sys.assert_unlocked();
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record. The latch on the
+ page of clust_rec locks the top of the stack of versions. The
+ bottom of the version stack is not locked; oldest versions may
+ disappear by the fact that transactions may be committed and
+ collected by the purge. This is not a problem, because we are
+ only interested in active transactions. */
+
+ clust_rec = row_get_clust_rec(
+ BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr);
+
+ if (!clust_rec) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.cc
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ trx = 0;
+ } else {
+ trx = row_vers_impl_x_locked_low(
+ caller_trx, clust_rec, clust_index, rec, index,
+ offsets, &mtr);
+
+ ut_ad(trx == 0 || trx->is_referenced());
+ }
+
+ mtr_commit(&mtr);
+
+ return(trx);
+}
+
+/** Build virtual column values from the current clustered index record data
+@param[in,out] row the clustered index row in dtuple form
+@param[in] clust_index clustered index
+@param[in] index the secondary index
+@param[in] heap heap used to build the virtual dtuple
+@return whether the virtual column values were built successfully */
+static
+bool
+row_vers_build_clust_v_col(
+ dtuple_t* row,
+ dict_index_t* clust_index,
+ dict_index_t* index,
+ mem_heap_t* heap)
+{
+ THD* thd= current_thd;
+ TABLE* maria_table= 0;
+
+ ut_ad(dict_index_has_virtual(index));
+ ut_ad(index->table == clust_index->table);
+
+ DEBUG_SYNC(current_thd, "ib_clust_v_col_before_row_allocated");
+
+ ib_vcol_row vc(nullptr);
+ byte *record = vc.record(thd, index, &maria_table);
+
+ ut_ad(maria_table);
+
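+ /* Recompute each indexed virtual column from the base column
+ values in row. */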
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_col_t* c = dict_index_get_nth_col(index, i);
+
+ if (c->is_virtual()) {
+ const dict_v_col_t* col
+ = reinterpret_cast<const dict_v_col_t*>(c);
+
+ dfield_t *vfield = innobase_get_computed_value(
+ row, col, clust_index, &vc.heap,
+ heap, NULL, thd, maria_table, record, NULL,
+ NULL);
+ if (!vfield) {
+ innobase_report_computed_value_failed(row);
+ ut_ad(0);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/** Build latest virtual column data from undo log
+@param[in] in_purge whether this is the purge thread
+@param[in] rec clustered index record
+@param[in] clust_index clustered index
+@param[in,out] clust_offsets offsets on the clustered index record
+@param[in] index the secondary index
+@param[in] roll_ptr the rollback pointer for the purging record
+@param[in] trx_id trx id for the purging record
+@param[in,out] v_heap heap used to build vrow
+@param[out] vrow dtuple holding the virtual rows
+@param[in,out] mtr mtr holding the latch on rec */
+static
+void
+row_vers_build_cur_vrow_low(
+ bool in_purge,
+ const rec_t* rec,
+ dict_index_t* clust_index,
+ rec_offs* clust_offsets,
+ dict_index_t* index,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* v_heap,
+ dtuple_t** vrow,
+ mtr_t* mtr)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ mem_heap_t* heap = NULL;
+ ulint num_v = dict_table_get_n_v_cols(index->table);
+ const dfield_t* field;
+ ulint i;
+ bool all_filled = false;
+
+ *vrow = dtuple_create_with_vcol(v_heap, 0, num_v);
+ dtuple_init_v_fld(*vrow);
+
+ for (i = 0; i < num_v; i++) {
+ dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype
+ = DATA_MISSING;
+ }
+
+ ut_ad(mtr->memo_contains_page_flagged(rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
+ version = rec;
+
+ /* If this is called by the purge thread, set the
+ TRX_UNDO_PREV_IN_PURGE bit to search the undo log until we hit
+ the undo log record identified by roll_ptr */
+ const ulint status = in_purge
+ ? TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE
+ : TRX_UNDO_GET_OLD_V_VALUE;
+
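+ /* Walk the undo log backwards, filling in virtual column values,
+ until every indexed virtual column is known, the versions end,
+ or the version identified by roll_ptr is reached. */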
+ while (!all_filled) {
+ mem_heap_t* heap2 = heap;
+ heap = mem_heap_create(1024);
+ roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr(
+ version, clust_index, clust_offsets);
+
+ trx_undo_prev_version_build(
+ version, clust_index, clust_offsets,
+ heap, &prev_version, NULL, vrow, status);
+
+ if (heap2) {
+ mem_heap_free(heap2);
+ }
+
+ if (!prev_version) {
+ /* Versions end here */
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ulint entry_len = dict_index_get_n_fields(index);
+
+ all_filled = true;
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, i);
+
+ if (!col->is_virtual()) {
+ continue;
+ }
+
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ field = dtuple_get_nth_v_field(*vrow, v_col->v_pos);
+
+ if (dfield_get_type(field)->mtype == DATA_MISSING) {
+ all_filled = false;
+ break;
+ }
+
+ }
+
+ trx_id_t rec_trx_id = row_get_rec_trx_id(
+ prev_version, clust_index, clust_offsets);
+
+ if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) {
+ break;
+ }
+
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+}
+
+/** Check whether the virtual column values of a secondary index entry
+match those of the current clustered index record, which are recreated
+from information stored in the undo log
+@param[in] rec record in the clustered index
+@param[in] icentry the index entry built from a cluster row
+@param[in] clust_index cluster index
+@param[in] clust_offsets offsets on the cluster record
+@param[in] index the secondary index
+@param[in] ientry the secondary index entry
+@param[in] roll_ptr the rollback pointer for the purging record
+@param[in] trx_id trx id for the purging record
+@param[in,out] v_heap heap used to build virtual dtuple
+@param[in,out] vrow dtuple holding the virtual rows (if needed)
+@param[in] mtr mtr holding the latch on rec
+@return true if matches, false otherwise */
+static
+bool
+row_vers_vc_matches_cluster(
+ const rec_t* rec,
+ const dtuple_t* icentry,
+ dict_index_t* clust_index,
+ rec_offs* clust_offsets,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* v_heap,
+ dtuple_t** vrow,
+ mtr_t* mtr)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ mem_heap_t* heap2;
+ mem_heap_t* heap = NULL;
+ mem_heap_t* tuple_heap;
+ ulint num_v = dict_table_get_n_v_cols(index->table);
+ bool compare[REC_MAX_N_FIELDS];
+ ulint n_fields = dtuple_get_n_fields(ientry);
+ ulint n_non_v_col = 0;
+ ulint n_cmp_v_col = 0;
+ const dfield_t* field1;
+ dfield_t* field2;
+ ulint i;
+
+ /* First compare non-virtual columns (primary keys) */
+ ut_ad(index->n_fields == n_fields);
+ ut_ad(n_fields == dtuple_get_n_fields(icentry));
+ ut_ad(mtr->memo_contains_page_flagged(rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
+ {
+ const dfield_t* a = ientry->fields;
+ const dfield_t* b = icentry->fields;
+
+ for (const dict_field_t *ifield = index->fields,
+ *const end = &index->fields[index->n_fields];
+ ifield != end; ifield++, a++, b++) {
+ if (!ifield->col->is_virtual()) {
+ if (cmp_dfield_dfield(a, b)) {
+ return false;
+ }
+ n_non_v_col++;
+ }
+ }
+ }
+
+ tuple_heap = mem_heap_create(1024);
+
+ ut_ad(n_fields > n_non_v_col);
+
+ *vrow = dtuple_create_with_vcol(v_heap ? v_heap : tuple_heap, 0, num_v);
+ dtuple_init_v_fld(*vrow);
+
+ for (i = 0; i < num_v; i++) {
+ dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype
+ = DATA_MISSING;
+ compare[i] = false;
+ }
+
+ version = rec;
+
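+ /* Walk the undo log backwards, recovering virtual column values
+ and comparing them with the secondary index entry, until all
+ indexed virtual columns have been compared or the versions end. */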
+ while (n_cmp_v_col < n_fields - n_non_v_col) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr(
+ version, clust_index, clust_offsets);
+
+ ut_ad(cur_roll_ptr != 0);
+ ut_ad(roll_ptr != 0);
+
+ trx_undo_prev_version_build(
+ version, clust_index, clust_offsets,
+ heap, &prev_version, NULL, vrow,
+ TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE);
+
+ if (heap2) {
+ mem_heap_free(heap2);
+ }
+
+ if (!prev_version) {
+ /* Versions end here */
+ goto func_exit;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ulint entry_len = dict_index_get_n_fields(index);
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col = ind_field->col;
+ field1 = dtuple_get_nth_field(ientry, i);
+
+ if (!col->is_virtual()) {
+ continue;
+ }
+
+ const dict_v_col_t* v_col
+ = reinterpret_cast<const dict_v_col_t*>(col);
+ field2
+ = dtuple_get_nth_v_field(*vrow, v_col->v_pos);
+
+ if ((dfield_get_type(field2)->mtype != DATA_MISSING)
+ && (!compare[v_col->v_pos])) {
+
+ if (ind_field->prefix_len != 0
+ && !dfield_is_null(field2)) {
+ field2->len = unsigned(
+ dtype_get_at_most_n_mbchars(
+ field2->type.prtype,
+ field2->type.mbminlen,
+ field2->type.mbmaxlen,
+ ind_field->prefix_len,
+ field2->len,
+ static_cast<char*>
+ (field2->data)));
+ }
+
+ /* The index field values mismatch */
+ if (v_heap
+ || cmp_dfield_dfield(field2, field1)) {
+ if (v_heap) {
+ dtuple_dup_v_fld(*vrow, v_heap);
+ }
+
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+ return(false);
+ }
+
+ compare[v_col->v_pos] = true;
+ n_cmp_v_col++;
+ }
+ }
+
+ trx_id_t rec_trx_id = row_get_rec_trx_id(
+ prev_version, clust_index, clust_offsets);
+
+ if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) {
+ break;
+ }
+
+ version = prev_version;
+ }
+
+func_exit:
+ if (n_cmp_v_col == 0) {
+ *vrow = NULL;
+ }
+
+ mem_heap_free(tuple_heap);
+ mem_heap_free(heap);
+
+ /* FIXME: if n_cmp_v_col is not the same as
+ n_fields - n_non_v_col, a callback is needed to compare the
+ remaining columns. For the time being, we return true */
+ return (true);
+}
+
+/** Build a dtuple that contains the virtual column data for the current
+clustered index record
+@param[in] in_purge called by purge thread
+@param[in] rec cluster index rec
+@param[in] clust_index cluster index
+@param[in] clust_offsets cluster rec offset
+@param[in] index secondary index
+@param[in] roll_ptr roll_ptr for the purge record
+@param[in] trx_id transaction ID on the purging record
+@param[in,out] heap heap memory
+@param[in,out] v_heap heap memory to keep the virtual column dtuple
+@param[in] mtr mtr holding the latch on rec
+@return dtuple contains virtual column data */
+static
+dtuple_t*
+row_vers_build_cur_vrow(
+ bool in_purge,
+ const rec_t* rec,
+ dict_index_t* clust_index,
+ rec_offs** clust_offsets,
+ dict_index_t* index,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id,
+ mem_heap_t* heap,
+ mem_heap_t* v_heap,
+ mtr_t* mtr)
+{
+ dtuple_t* cur_vrow = NULL;
+
+ roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr(
+ rec, clust_index, *clust_offsets);
+
+ /* if the row is newly inserted, then the virtual
+ columns need to be computed */
+ if (trx_undo_roll_ptr_is_insert(t_roll_ptr)) {
+
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+
+ /* This is a newly inserted record and cannot
+ be delete-marked, so the externally stored fields
+ cannot have been freed yet. */
+ dtuple_t* row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, *clust_offsets,
+ NULL, NULL, NULL, NULL, heap);
+
+ if (!row_vers_build_clust_v_col(row, clust_index, index,
+ heap)) {
+ return nullptr;
+ }
+
+ cur_vrow = dtuple_copy(row, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ } else {
+ /* Try to fetch virtual column data from undo log */
+ row_vers_build_cur_vrow_low(
+ in_purge, rec, clust_index, *clust_offsets,
+ index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr);
+ }
+
+ *clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ return(cur_vrow);
+}
+
+/** @return whether two data tuples are equal */
+static bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2)
+{
+ ut_ad(tuple1.magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(tuple2.magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(&tuple1));
+ ut_ad(dtuple_check_typed(&tuple2));
+ ut_ad(tuple1.n_fields == tuple2.n_fields);
+
+ for (ulint i= 0; i < tuple1.n_fields; i++)
+ if (cmp_dfield_dfield(&tuple1.fields[i], &tuple2.fields[i]))
+ return false;
+ return true;
+}
+
+/** Find out whether a data tuple is missing the data type
+of an indexed virtual column.
+@param tuple data tuple
+@param index index containing virtual columns
+@return true if the tuple has a missing column type */
+static bool dtuple_vcol_data_missing(const dtuple_t &tuple,
+ dict_index_t *index)
+{
+ for (ulint i= 0; i < index->n_uniq; i++)
+ {
+ dict_col_t *col= index->fields[i].col;
+ if (!col->is_virtual())
+ continue;
+ dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col);
+ for (ulint j= 0; j < index->table->n_v_cols; j++)
+ {
+ if (vcol == &index->table->v_cols[j]
+ && tuple.v_fields[j].type.mtype == DATA_MISSING)
+ return true;
+ }
+ }
+ return false;
+}
+
+/** Finds out if a version of the record, where the version >= the current
+purge_sys.view, should have ientry as its secondary index entry. We check
+whether there is any not-delete-marked version of the record where the trx
+id >= the purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@param[in] also_curr TRUE if also rec is included in the versions
+ to search; otherwise only versions prior
+ to it are searched
+@param[in] rec record in the clustered index; the caller
+ must have a latch on the page
+@param[in] mtr mtr holding the latch on rec; it will
+ also hold the latch on purge_view
+@param[in] index secondary index
+@param[in] ientry secondary index entry
+@param[in] roll_ptr roll_ptr for the purge record
+@param[in] trx_id transaction ID on the purging record
+@return TRUE if an earlier version should have ientry as its secondary index entry */
+bool
+row_vers_old_has_index_entry(
+ bool also_curr,
+ const rec_t* rec,
+ mtr_t* mtr,
+ dict_index_t* index,
+ const dtuple_t* ientry,
+ roll_ptr_t roll_ptr,
+ trx_id_t trx_id)
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ rec_offs* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ const dtuple_t* entry;
+ ulint comp;
+ dtuple_t* vrow = NULL;
+ mem_heap_t* v_heap = NULL;
+ dtuple_t* cur_vrow = NULL;
+
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ v_heap = mem_heap_create(100);
+ }
+
+ DBUG_EXECUTE_IF("ib_purge_virtual_index_crash",
+ DBUG_SUICIDE(););
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The top of the stack of versions is locked by the
+ mtr holding a latch on the page containing the
+ clustered index record. The bottom of the stack is
+ locked by the fact that the purge_sys.view must
+ 'overtake' any read view of an active transaction.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+
+
+#ifdef DBUG_OFF
+# define dbug_v_purge false
+#else /* DBUG_OFF */
+ bool dbug_v_purge = false;
+#endif /* DBUG_OFF */
+
+ DBUG_EXECUTE_IF(
+ "ib_purge_virtual_index_callback",
+ dbug_v_purge = true;);
+
+ roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr(
+ rec, clust_index, clust_offsets);
+
+ /* if the row is newly inserted, then the virtual
+ columns need to be computed */
+ if (trx_undo_roll_ptr_is_insert(t_roll_ptr)
+ || dbug_v_purge) {
+
+ if (!row_vers_build_clust_v_col(
+ row, clust_index, index, heap)) {
+ goto unsafe_to_purge;
+ }
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+ goto unsafe_to_purge;
+ }
+ } else {
+ /* Build index entry out of row */
+ entry = row_build_index_entry(row, ext, index, heap);
+ /* entry could only be NULL if
+ the clustered index record is an uncommitted
+ inserted record whose BLOBs have not been
+ written yet. The secondary index record
+ can be safely removed, because it cannot
+ possibly refer to this incomplete
+ clustered index record. (Insert would
+ always first be completed for the
+ clustered index record, then proceed to
+ secondary indexes.) */
+
+ if (entry && row_vers_vc_matches_cluster(
+ rec, entry,
+ clust_index, clust_offsets,
+ index, ientry, roll_ptr,
+ trx_id, NULL, &vrow, mtr)) {
+ goto unsafe_to_purge;
+ }
+ }
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records to secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the row may be in the process of being
+ modified, so that the clustered index record has already
+ been updated to a different binary value in a char field,
+ but the collation still identifies the old and new
+ values as equal! */
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+unsafe_to_purge:
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+ return true;
+ }
+ }
+ } else if (dict_index_has_virtual(index)) {
+ /* The current clustered index record could be
+ delete-marked, but a previous version of it might not be.
+ We will need to get the virtual column data from the undo
+ record associated with the current clustered index record */
+
+ cur_vrow = row_vers_build_cur_vrow(
+ also_curr, rec, clust_index, &clust_offsets,
+ index, roll_ptr, trx_id, heap, v_heap, mtr);
+ }
+
+ version = rec;
+
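+ /* Walk the version chain backwards through the undo log. If any
+ non-delete-marked version would produce ientry, the secondary
+ index record must not be purged yet. */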
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ vrow = NULL;
+
+ trx_undo_prev_version_build(version,
+ clust_index, clust_offsets,
+ heap, &prev_version, nullptr,
+ dict_index_has_virtual(index)
+ ? &vrow : nullptr,
+ TRX_UNDO_CHECK_PURGEABILITY);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (!prev_version) {
+ /* Versions end here */
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ return false;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ if (dtuple_vcol_data_missing(*vrow, index)) {
+ goto nochange_index;
+ }
+ /* Keep the virtual row info for the next
+ version, unless it is changed */
+ mem_heap_empty(v_heap);
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Nothing for this index has changed,
+ continue */
+nochange_index:
+ version = prev_version;
+ continue;
+ }
+ }
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ ut_ad(cur_vrow);
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the secondary index record may have
+ already been updated to a different binary value in
+ a char field, but the collation still identifies the
+ old and new values as equal! */
+
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+ goto unsafe_to_purge;
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ dberr_t err;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!view->changes_visible(trx_id));
+
+ ut_ad(!vrow || !(*vrow));
+
+ version = rec;
+
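+ /* Walk backwards through the undo log, rebuilding older versions
+ of the record until one is visible to the read view. */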
+ for (;;) {
+ mem_heap_t* prev_heap = heap;
+
+ heap = mem_heap_create(1024);
+
+ if (vrow) {
+ *vrow = NULL;
+ }
+
+ /* If purge can't see the record then we can't rely on
+ the UNDO log record. */
+
+ err = trx_undo_prev_version_build(
+ version, index, *offsets, heap,
+ &prev_version, NULL, vrow, 0);
+
+ if (prev_heap != NULL) {
+ mem_heap_free(prev_heap);
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ *offsets = rec_get_offsets(
+ prev_version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED, offset_heap);
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (view->changes_visible(trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ } else if (trx_id >= view->low_limit_id()
+ && trx_id >= trx_sys.get_max_trx_id()) {
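+ /* The DB_TRX_ID of the previous version is not
+ smaller than any transaction id ever assigned:
+ the history must be corrupted. */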
+ err = DB_CORRUPTION;
+ break;
+ }
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */
+# pragma GCC optimize ("O0")
+#endif
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ trx_t* caller_trx,/*!<in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row, old version, or NULL
+ if it is not updated in the view */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ trx_id_t rec_trx_id = 0;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ version = rec;
+ ut_ad(!vrow || !(*vrow));
+
+ for (;;) {
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ if (!trx_sys.is_registered(caller_trx, version_trx_id)) {
+committed_version_trx:
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (rec == version) {
+ *old_vers = rec;
+ if (vrow) {
+ *vrow = NULL;
+ }
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_STATE_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (rec_trx_id == version_trx_id) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(
+ version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ }
+
+ DEBUG_SYNC_C("after_row_vers_check_trx_active");
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ if (trx_undo_prev_version_build(version, index, *offsets, heap,
+ &prev_version, in_heap, vrow,
+ 0) != DB_SUCCESS) {
+ mem_heap_free(heap);
+ heap = heap2;
+ heap2 = NULL;
+ goto committed_version_trx;
+ }
+
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}