author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
commit    | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree      | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/row
parent    | Initial commit. (diff)
download  | mariadb-upstream.tar.xz mariadb-upstream.zip
Adding upstream version 1:10.11.6. (upstream/1%10.11.6, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | storage/innobase/row/row0ext.cc     |  132
-rw-r--r-- | storage/innobase/row/row0ftsort.cc  | 1791
-rw-r--r-- | storage/innobase/row/row0import.cc  | 4585
-rw-r--r-- | storage/innobase/row/row0ins.cc     | 3843
-rw-r--r-- | storage/innobase/row/row0log.cc     | 4134
-rw-r--r-- | storage/innobase/row/row0merge.cc   | 5406
-rw-r--r-- | storage/innobase/row/row0mysql.cc   | 2916
-rw-r--r-- | storage/innobase/row/row0purge.cc   | 1304
-rw-r--r-- | storage/innobase/row/row0quiesce.cc |  715
-rw-r--r-- | storage/innobase/row/row0row.cc     | 1720
-rw-r--r-- | storage/innobase/row/row0sel.cc     | 6947
-rw-r--r-- | storage/innobase/row/row0uins.cc    |  652
-rw-r--r-- | storage/innobase/row/row0umod.cc    | 1288
-rw-r--r-- | storage/innobase/row/row0undo.cc    |  453
-rw-r--r-- | storage/innobase/row/row0upd.cc     | 3002
-rw-r--r-- | storage/innobase/row/row0vers.cc    | 1419
16 files changed, 40307 insertions, 0 deletions
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc new file mode 100644 index 00000000..b7a62760 --- /dev/null +++ b/storage/innobase/row/row0ext.cc @@ -0,0 +1,132 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ext.cc +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "row0ext.h" +#include "btr0cur.h" + +/** Fills the column prefix cache of an externally stored column. +@param[in,out] ext column prefix cache +@param[in] i index of ext->ext[] +@param[in] space tablespace +@param[in] dfield data field */ +static +void +row_ext_cache_fill( + row_ext_t* ext, + ulint i, + fil_space_t* space, + const dfield_t* dfield) +{ + const byte* field = static_cast<const byte*>( + dfield_get_data(dfield)); + ulint f_len = dfield_get_len(dfield); + byte* buf = ext->buf + i * ext->max_len; + + ut_ad(ext->max_len > 0); + ut_ad(i < ext->n_ext); + ut_ad(dfield_is_ext(dfield)); + ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY(!memcmp(field_ref_zero, + field + f_len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* The BLOB pointer is not set: we cannot fetch it */ + ext->len[i] = 0; + } else { + if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN + && f_len > BTR_EXTERN_FIELD_REF_SIZE) { + /* In this case, the field is in B format or beyond, + (refer to the definition of row_ext_t.max_len) + and the field is already fill with prefix, otherwise + f_len would be BTR_EXTERN_FIELD_REF_SIZE. + So there is no need to re-read the prefix externally, + but just copy the local prefix to buf. Please note + if the ext->len[i] is zero, it means an error + as above. */ + memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE); + ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE; + } else { + /* Fetch at most ext->max_len of the column. + The column should be non-empty. However, + trx_rollback_all_recovered() may try to + access a half-deleted BLOB if the server previously + crashed during the execution of + btr_free_externally_stored_field(). */ + ext->len[i] = btr_copy_externally_stored_field_prefix( + buf, ext->max_len, ext->zip_size, + field, f_len); + } + } +} + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. 
+@return own: column prefix cache */ +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dict_table_t& table, /*!< in: table */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap) /*!< in: heap where created */ +{ + if (!table.space) { + return NULL; + } + + ut_ad(n_ext > 0); + + row_ext_t* ret = static_cast<row_ext_t*>( + mem_heap_alloc(heap, + (sizeof *ret) + (n_ext - 1) * sizeof ret->len)); + + ret->n_ext = n_ext; + ret->ext = ext; + ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(table.flags); + ret->zip_size = dict_tf_get_zip_size(table.flags); + + ret->buf = static_cast<byte*>( + mem_heap_alloc(heap, n_ext * ret->max_len)); + + /* Fetch the BLOB prefixes */ + for (ulint i = 0; i < n_ext; i++) { + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple, ext[i]); + row_ext_cache_fill(ret, i, table.space, dfield); + } + + return(ret); +} diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc new file mode 100644 index 00000000..17a2f034 --- /dev/null +++ b/storage/innobase/row/row0ftsort.cc @@ -0,0 +1,1791 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ftsort.cc +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#include "row0ftsort.h" +#include "dict0dict.h" +#include "row0merge.h" +#include "row0row.h" +#include "btr0cur.h" +#include "fts0plugin.h" +#include "log0crypt.h" + +/** Read the next record to buffer N. +@param N index into array of merge info structure */ +#define ROW_MERGE_READ_GET_NEXT(N) \ + do { \ + b[N] = row_merge_read_rec( \ + block[N], buf[N], b[N], index, \ + fd[N], &foffs[N], &mrec[N], offsets[N], \ + crypt_block[N], space); \ + if (UNIV_UNLIKELY(!b[N])) { \ + if (mrec[N]) { \ + goto exit; \ + } \ + } \ + } while (0) + +/** Parallel sort degree */ +ulong fts_sort_pll_degree = 2; + +/*********************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. 
The index has three "fields": + +1) Tokenized word, +2) Doc ID (depend on number of records to sort, it can be a 4 bytes or 8 bytes +integer value) +3) Word's position in original doc. + +@see fts_create_one_index_table() + +@return dict_index_t structure for the fts sort index */ +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size) + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ +{ + dict_index_t* new_index; + dict_field_t* field; + dict_field_t* idx_field; + CHARSET_INFO* charset; + + // FIXME: This name shouldn't be hard coded here. + new_index = dict_mem_index_create(table, "tmp_fts_idx", DICT_FTS, 3); + + new_index->id = index->id; + new_index->n_uniq = FTS_NUM_FIELDS_SORT; + new_index->n_def = FTS_NUM_FIELDS_SORT; + new_index->cached = TRUE; + new_index->parser = index->parser; + + idx_field = dict_index_get_nth_field(index, 0); + charset = fts_index_get_charset(index); + + /* The first field is on the Tokenized Word */ + field = dict_index_get_nth_field(new_index, 0); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL; + field->col->mtype = charset == &my_charset_latin1 + ? DATA_VARCHAR : DATA_VARMYSQL; + field->col->mbminlen = idx_field->col->mbminlen; + field->col->mbmaxlen = idx_field->col->mbmaxlen; + field->col->len = static_cast<uint16_t>( + HA_FT_MAXCHARLEN * field->col->mbmaxlen); + + field->fixed_len = 0; + + /* Doc ID */ + field = dict_index_get_nth_field(new_index, 1); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + *opt_doc_id_size = FALSE; + + /* Check whether we can use 4 bytes instead of 8 bytes integer + field to hold the Doc ID, thus reduce the overall sort size */ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + /* If Doc ID column is being added by this create + index, then just check the number of rows in the table */ + if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } else { + doc_id_t max_doc_id; + + /* If the Doc ID column is supplied by user, then + check the maximum Doc ID in the table */ + max_doc_id = fts_get_max_doc_id((dict_table_t*) table); + + if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) { + *opt_doc_id_size = TRUE; + } + } + + if (*opt_doc_id_size) { + field->col->len = sizeof(ib_uint32_t); + field->fixed_len = sizeof(ib_uint32_t); + } else { + field->col->len = FTS_DOC_ID_LEN; + field->fixed_len = FTS_DOC_ID_LEN; + } + + field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE; + + /* The third field is on the word's position in the original doc */ + field = dict_index_get_nth_field(new_index, 2); + field->name = NULL; + field->prefix_len = 0; + field->descending = false; + field->col = static_cast<dict_col_t*>( + mem_heap_zalloc(new_index->heap, sizeof(dict_col_t))); + field->col->mtype = DATA_INT; + field->col->len = 4 ; + field->fixed_len = 4; + field->col->prtype = DATA_NOT_NULL; + + return(new_index); +} + +/** Initialize FTS parallel sort structures. 
+@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in,out] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_zip_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated +@return true if all successful */ +bool +row_fts_psort_info_init( + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) +{ + ulint i; + ulint j; + fts_psort_common_t* common_info = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + ulint block_size; + ibool ret = TRUE; + ut_ad(ut_is_2pow(old_zip_size)); + + block_size = 3 * srv_sort_buf_size; + + *psort = psort_info = static_cast<fts_psort_t*>(ut_zalloc_nokey( + fts_sort_pll_degree * sizeof *psort_info)); + + if (!psort_info) { + ut_free(dup); + return(FALSE); + } + + /* Common Info for all sort threads */ + common_info = static_cast<fts_psort_common_t*>( + ut_malloc_nokey(sizeof *common_info)); + + if (!common_info) { + ut_free(dup); + ut_free(psort_info); + return(FALSE); + } + + common_info->dup = dup; + common_info->new_table = new_table; + common_info->old_zip_size = old_zip_size; + common_info->trx = trx; + common_info->all_info = psort_info; + pthread_cond_init(&common_info->sort_cond, nullptr); + common_info->opt_doc_id_size = opt_doc_id_size; + + ut_ad(trx->mysql_thd != NULL); + const char* path = thd_innodb_tmpdir(trx->mysql_thd); + /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for + each parallel sort thread. Each "sort bucket" holds records for + a particular "FTS index partition" */ + for (j = 0; j < fts_sort_pll_degree; j++) { + + UT_LIST_INIT( + psort_info[j].fts_doc_list, &fts_doc_item_t::doc_list); + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + + psort_info[j].merge_file[i] = + static_cast<merge_file_t*>( + ut_zalloc_nokey(sizeof(merge_file_t))); + + if (!psort_info[j].merge_file[i]) { + ret = FALSE; + goto func_exit; + } + + psort_info[j].merge_buf[i] = row_merge_buf_create( + dup->index); + + if (row_merge_file_create(psort_info[j].merge_file[i], + path) == OS_FILE_CLOSED) { + goto func_exit; + } + + /* Need to align memory for O_DIRECT write */ + psort_info[j].merge_block[i] = + static_cast<row_merge_block_t*>( + aligned_malloc(block_size, 1024)); + + if (!psort_info[j].merge_block[i]) { + ret = FALSE; + goto func_exit; + } + + /* If tablespace is encrypted, allocate additional buffer for + encryption/decryption. 
*/ + if (srv_encrypt_log) { + /* Need to align memory for O_DIRECT write */ + psort_info[j].crypt_block[i] = + static_cast<row_merge_block_t*>( + aligned_malloc(block_size, + 1024)); + + if (!psort_info[j].crypt_block[i]) { + ret = FALSE; + goto func_exit; + } + } else { + psort_info[j].crypt_block[i] = NULL; + } + } + + psort_info[j].child_status = 0; + psort_info[j].state = 0; + psort_info[j].psort_common = common_info; + psort_info[j].error = DB_SUCCESS; + psort_info[j].memory_used = 0; + mysql_mutex_init(0, &psort_info[j].mutex, nullptr); + } + + /* Initialize merge_info structures parallel merge and insert + into auxiliary FTS tables (FTS_INDEX_TABLE) */ + *merge = merge_info = static_cast<fts_psort_t*>( + ut_malloc_nokey(FTS_NUM_AUX_INDEX * sizeof *merge_info)); + + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + + merge_info[j].child_status = 0; + merge_info[j].state = 0; + merge_info[j].psort_common = common_info; + } + +func_exit: + if (!ret) { + row_fts_psort_info_destroy(psort_info, merge_info); + } + + return(ret); +} +/*********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close the +merge sort files */ +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info) /*!< parallel merge info */ +{ + ulint i; + ulint j; + + if (psort_info) { + for (j = 0; j < fts_sort_pll_degree; j++) { + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (psort_info[j].merge_file[i]) { + row_merge_file_destroy( + psort_info[j].merge_file[i]); + } + + aligned_free(psort_info[j].merge_block[i]); + ut_free(psort_info[j].merge_file[i]); + aligned_free(psort_info[j].crypt_block[i]); + } + + mysql_mutex_destroy(&psort_info[j].mutex); + } + + pthread_cond_destroy(&merge_info[0].psort_common->sort_cond); + ut_free(merge_info[0].psort_common->dup); + ut_free(merge_info[0].psort_common); + ut_free(psort_info); + } + + ut_free(merge_info); +} +/*********************************************************************//** +Free up merge buffers when merge sort is done */ +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info) /*!< in: parallel sort info */ +{ + ulint j; + ulint i; + + if (!psort_info) { + return; + } + + for (j = 0; j < fts_sort_pll_degree; j++) { + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + row_merge_buf_free(psort_info[j].merge_buf[i]); + } + } + + return; +} + +/*********************************************************************//** +FTS plugin parser 'myql_add_word' callback function for row merge. +Refer to 'st_mysql_ftparser_param' for more detail. 
+@return always returns 0 */ +static +int +row_merge_fts_doc_add_word_for_parser( +/*==================================*/ + MYSQL_FTPARSER_PARAM *param, /* in: parser paramter */ + const char *word, /* in: token word */ + int word_len, /* in: word len */ + MYSQL_FTPARSER_BOOLEAN_INFO* boolean_info) /* in: boolean info */ +{ + fts_string_t str; + fts_tokenize_ctx_t* t_ctx; + row_fts_token_t* fts_token; + byte* ptr; + + ut_ad(param); + ut_ad(param->mysql_ftparam); + ut_ad(word); + ut_ad(boolean_info); + + t_ctx = static_cast<fts_tokenize_ctx_t*>(param->mysql_ftparam); + ut_ad(t_ctx); + + str.f_str = (byte*)(word); + str.f_len = ulint(word_len); + str.f_n_char = fts_get_token_size( + (CHARSET_INFO*)param->cs, word, ulint(word_len)); + + /* JAN: TODO: MySQL 5.7 FTS + ut_ad(boolean_info->position >= 0); + */ + + ptr = static_cast<byte*>(ut_malloc_nokey(sizeof(row_fts_token_t) + + sizeof(fts_string_t) + str.f_len)); + fts_token = reinterpret_cast<row_fts_token_t*>(ptr); + fts_token->text = reinterpret_cast<fts_string_t*>( + ptr + sizeof(row_fts_token_t)); + fts_token->text->f_str = static_cast<byte*>( + ptr + sizeof(row_fts_token_t) + sizeof(fts_string_t)); + + fts_token->text->f_len = str.f_len; + fts_token->text->f_n_char = str.f_n_char; + memcpy(fts_token->text->f_str, str.f_str, str.f_len); + + /* JAN: TODO: MySQL 5.7 FTS + fts_token->position = boolean_info->position; + */ + + /* Add token to list */ + UT_LIST_ADD_LAST(t_ctx->fts_token_list, fts_token); + + return(0); +} + +/*********************************************************************//** +Tokenize by fts plugin parser */ +static +void +row_merge_fts_doc_tokenize_by_parser( +/*=================================*/ + fts_doc_t* doc, /* in: doc to tokenize */ + st_mysql_ftparser* parser, /* in: plugin parser instance */ + fts_tokenize_ctx_t* t_ctx) /* in/out: tokenize ctx instance */ +{ + MYSQL_FTPARSER_PARAM param; + + ut_a(parser); + + /* Set paramters for param */ + param.mysql_parse = fts_tokenize_document_internal; + param.mysql_add_word = row_merge_fts_doc_add_word_for_parser; + param.mysql_ftparam = t_ctx; + param.cs = doc->charset; + param.doc = reinterpret_cast<char*>(doc->text.f_str); + param.length = static_cast<int>(doc->text.f_len); + param.mode= MYSQL_FTPARSER_SIMPLE_MODE; + + PARSER_INIT(parser, ¶m); + /* We assume parse returns successfully here. */ + parser->parse(¶m); + PARSER_DEINIT(parser, ¶m); +} + +/*********************************************************************//** +Tokenize incoming text data and add to the sort buffer. 
+@see row_merge_buf_encode() +@return TRUE if the record passed, FALSE if out of space */ +static +ibool +row_merge_fts_doc_tokenize( +/*=======================*/ + row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */ + doc_id_t doc_id, /*!< in: Doc ID */ + fts_doc_t* doc, /*!< in: Doc to be tokenized */ + merge_file_t** merge_file, /*!< in/out: merge file */ + ibool opt_doc_id_size,/*!< in: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort*/ + fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */ +{ + ulint inc = 0; + fts_string_t str; + ulint len; + row_merge_buf_t* buf; + dfield_t* field; + fts_string_t t_str; + ibool buf_full = FALSE; + byte str_buf[FTS_MAX_WORD_LEN + 1]; + ulint data_size[FTS_NUM_AUX_INDEX]; + ulint n_tuple[FTS_NUM_AUX_INDEX]; + st_mysql_ftparser* parser; + + t_str.f_n_char = 0; + t_ctx->buf_used = 0; + + memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); + memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); + + parser = sort_buf[0]->index->parser; + + /* Tokenize the data and add each word string, its corresponding + doc id and position to sort buffer */ + while (parser + ? (!t_ctx->processed_len + || UT_LIST_GET_LEN(t_ctx->fts_token_list)) + : t_ctx->processed_len < doc->text.f_len) { + ulint idx = 0; + ulint cur_len; + doc_id_t write_doc_id; + row_fts_token_t* fts_token = NULL; + + if (parser != NULL) { + if (t_ctx->processed_len == 0) { + UT_LIST_INIT(t_ctx->fts_token_list, &row_fts_token_t::token_list); + + /* Parse the whole doc and cache tokens */ + row_merge_fts_doc_tokenize_by_parser(doc, + parser, t_ctx); + + /* Just indictate we have parsed all the word */ + t_ctx->processed_len += 1; + } + + /* Then get a token */ + fts_token = UT_LIST_GET_FIRST(t_ctx->fts_token_list); + if (fts_token) { + str.f_len = fts_token->text->f_len; + str.f_n_char = fts_token->text->f_n_char; + str.f_str = fts_token->text->f_str; + } else { + ut_ad(UT_LIST_GET_LEN(t_ctx->fts_token_list) == 0); + /* Reach the end of the list */ + t_ctx->processed_len = doc->text.f_len; + break; + } + } else { + inc = innobase_mysql_fts_get_token( + doc->charset, + doc->text.f_str + t_ctx->processed_len, + doc->text.f_str + doc->text.f_len, &str); + + ut_a(inc > 0); + } + + /* Ignore string whose character number is less than + "fts_min_token_size" or more than "fts_max_token_size" */ + if (!fts_check_token(&str, NULL, NULL)) { + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + + continue; + } + + t_str.f_len = innobase_fts_casedn_str( + doc->charset, (char*) str.f_str, str.f_len, + (char*) &str_buf, FTS_MAX_WORD_LEN + 1); + + t_str.f_str = (byte*) &str_buf; + + /* if "cached_stopword" is defined, ignore words in the + stopword list */ + if (!fts_check_token(&str, t_ctx->cached_stopword, + doc->charset)) { + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + + continue; + } + + /* There are FTS_NUM_AUX_INDEX auxiliary tables, find + out which sort buffer to put this word record in */ + t_ctx->buf_used = fts_select_index( + doc->charset, t_str.f_str, t_str.f_len); + + buf = sort_buf[t_ctx->buf_used]; + + ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX); + idx = t_ctx->buf_used; + + mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]]; + + field = mtuple->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, + FTS_NUM_FIELDS_SORT * sizeof *field)); + + /* The 
first field is the tokenized word */ + dfield_set_data(field, t_str.f_str, t_str.f_len); + len = dfield_get_len(field); + + dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type); + field->type.prtype |= DATA_NOT_NULL; + ut_ad(len <= field->type.len); + + /* For the temporary file, row_merge_buf_encode() uses + 1 byte for representing the number of extra_size bytes. + This number will always be 1, because for this 3-field index + consisting of one variable-size column, extra_size will always + be 1 or 2, which can be encoded in one byte. + + The extra_size is 1 byte if the length of the + variable-length column is less than 128 bytes or the + maximum length is less than 256 bytes. */ + + /* One variable length column, word with its lenght less than + fts_max_token_size, add one extra size and one extra byte. + + Since the max length for FTS token now is larger than 255, + so we will need to signify length byte itself, so only 1 to 128 + bytes can be used for 1 bytes, larger than that 2 bytes. */ + if (len < 128 || field->type.len < 256) { + /* Extra size is one byte. */ + cur_len = 2 + len; + } else { + /* Extra size is two bytes. */ + cur_len = 3 + len; + } + + dfield_dup(field, buf->heap); + field++; + + /* The second field is the Doc ID */ + + ib_uint32_t doc_id_32_bit; + + if (!opt_doc_id_size) { + fts_write_doc_id((byte*) &write_doc_id, doc_id); + + dfield_set_data( + field, &write_doc_id, sizeof(write_doc_id)); + } else { + mach_write_to_4( + (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id); + + dfield_set_data( + field, &doc_id_32_bit, sizeof(doc_id_32_bit)); + } + + len = field->len; + ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t)); + + field->type.mtype = DATA_INT; + field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE; + field->type.len = static_cast<uint16_t>(field->len); + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + + cur_len += len; + dfield_dup(field, buf->heap); + + ++field; + + /* The third field is the position. + MySQL 5.7 changed the fulltext parser plugin interface + by adding MYSQL_FTPARSER_BOOLEAN_INFO::position. + Below we assume that the field is always 0. */ + ulint pos = t_ctx->init_pos; + byte position[4]; + if (parser == NULL) { + pos += t_ctx->processed_len + inc - str.f_len; + } + len = 4; + mach_write_to_4(position, pos); + dfield_set_data(field, &position, len); + + field->type.mtype = DATA_INT; + field->type.prtype = DATA_NOT_NULL; + field->type.len = 4; + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + cur_len += len; + dfield_dup(field, buf->heap); + + /* Reserve one byte for the end marker of row_merge_block_t */ + if (buf->total_size + data_size[idx] + cur_len + >= srv_sort_buf_size - 1) { + + buf_full = TRUE; + break; + } + + /* Increment the number of tuples */ + n_tuple[idx]++; + if (parser != NULL) { + UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); + ut_free(fts_token); + } else { + t_ctx->processed_len += inc; + } + data_size[idx] += cur_len; + } + + /* Update the data length and the number of new word tuples + added in this round of tokenization */ + for (ulint i = 0; i < FTS_NUM_AUX_INDEX; i++) { + /* The computation of total_size below assumes that no + delete-mark flags will be stored and that all fields + are NOT NULL and fixed-length. 
*/ + + sort_buf[i]->total_size += data_size[i]; + + sort_buf[i]->n_tuples += n_tuple[i]; + + merge_file[i]->n_rec += n_tuple[i]; + t_ctx->rows_added[i] += n_tuple[i]; + } + + if (!buf_full) { + /* we pad one byte between text accross two fields */ + t_ctx->init_pos += doc->text.f_len + 1; + } + + return(!buf_full); +} + +/*********************************************************************//** +Get next doc item from fts_doc_list */ +UNIV_INLINE +void +row_merge_fts_get_next_doc_item( +/*============================*/ + fts_psort_t* psort_info, /*!< in: psort_info */ + fts_doc_item_t** doc_item) /*!< in/out: doc item */ +{ + if (*doc_item != NULL) { + ut_free(*doc_item); + } + + mysql_mutex_lock(&psort_info->mutex); + + *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list); + if (*doc_item != NULL) { + UT_LIST_REMOVE(psort_info->fts_doc_list, *doc_item); + + ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t) + + (*doc_item)->field->len); + psort_info->memory_used -= sizeof(fts_doc_item_t) + + (*doc_item)->field->len; + } + + mysql_mutex_unlock(&psort_info->mutex); +} + +/*********************************************************************//** +Function performs parallel tokenization of the incoming doc strings. +It also performs the initial in memory sort of the parsed records. +*/ +static +void fts_parallel_tokenization( +/*======================*/ + void* arg) /*!< in: psort_info for the thread */ +{ + fts_psort_t* psort_info = (fts_psort_t*) arg; + ulint i; + fts_doc_item_t* doc_item = NULL; + row_merge_buf_t** buf; + ibool processed = FALSE; + merge_file_t** merge_file; + row_merge_block_t** block; + row_merge_block_t** crypt_block; + pfs_os_file_t tmpfd[FTS_NUM_AUX_INDEX]; + ulint mycount[FTS_NUM_AUX_INDEX]; + ulint num_doc_processed = 0; + doc_id_t last_doc_id = 0; + mem_heap_t* blob_heap = NULL; + fts_doc_t doc; + dict_table_t* table = psort_info->psort_common->new_table; + fts_tokenize_ctx_t t_ctx; + ulint retried = 0; + dberr_t error = DB_SUCCESS; + + ut_ad(psort_info->psort_common->trx->mysql_thd != NULL); + + /* const char* path = thd_innodb_tmpdir( + psort_info->psort_common->trx->mysql_thd); + */ + + ut_ad(psort_info->psort_common->trx->mysql_thd != NULL); + + const char* path = thd_innodb_tmpdir( + psort_info->psort_common->trx->mysql_thd); + + ut_ad(psort_info); + + buf = psort_info->merge_buf; + merge_file = psort_info->merge_file; + blob_heap = mem_heap_create(512); + memset(&doc, 0, sizeof(doc)); + memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int)); + + doc.charset = fts_index_get_charset( + psort_info->psort_common->dup->index); + + block = psort_info->merge_block; + crypt_block = psort_info->crypt_block; + + const ulint zip_size = psort_info->psort_common->old_zip_size; + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + t_ctx.cached_stopword = table->fts->cache->stopword_info.cached_stopword; + processed = TRUE; +loop: + while (doc_item) { + dfield_t* dfield = doc_item->field; + + last_doc_id = doc_item->doc_id; + + ut_ad (dfield->data != NULL + && dfield_get_len(dfield) != UNIV_SQL_NULL); + + /* If finish processing the last item, update "doc" with + strings in the doc_item, otherwise continue processing last + item */ + if (processed) { + byte* data; + ulint data_len; + + dfield = doc_item->field; + data = static_cast<byte*>(dfield_get_data(dfield)); + data_len = dfield_get_len(dfield); + + if (dfield_is_ext(dfield)) { + doc.text.f_str = + btr_copy_externally_stored_field( + &doc.text.f_len, data, + zip_size, data_len, blob_heap); + } else { + 
doc.text.f_str = data; + doc.text.f_len = data_len; + } + + doc.tokens = 0; + t_ctx.processed_len = 0; + } else { + /* Not yet finish processing the "doc" on hand, + continue processing it */ + ut_ad(doc.text.f_str); + ut_ad(buf[0]->index->parser + || t_ctx.processed_len < doc.text.f_len); + } + + processed = row_merge_fts_doc_tokenize( + buf, doc_item->doc_id, &doc, + merge_file, psort_info->psort_common->opt_doc_id_size, + &t_ctx); + + /* Current sort buffer full, need to recycle */ + if (!processed) { + ut_ad(buf[0]->index->parser + || t_ctx.processed_len < doc.text.f_len); + ut_ad(t_ctx.rows_added[t_ctx.buf_used]); + break; + } + + num_doc_processed++; + + if (UNIV_UNLIKELY(fts_enable_diag_print) + && num_doc_processed % 10000 == 1) { + ib::info() << "Number of documents processed: " + << num_doc_processed; +#ifdef FTS_INTERNAL_DIAG_PRINT + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + ib::info() << "ID " << psort_info->psort_id + << ", partition " << i << ", word " + << mycount[i]; + } +#endif + } + + mem_heap_empty(blob_heap); + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + if (doc_item && last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + } + + /* If we run out of current sort buffer, need to sort + and flush the sort buffer to disk */ + if (t_ctx.rows_added[t_ctx.buf_used] && !processed) { + row_merge_buf_sort(buf[t_ctx.buf_used], NULL); + row_merge_buf_write(buf[t_ctx.buf_used], +#ifndef DBUG_OFF + merge_file[t_ctx.buf_used], +#endif + block[t_ctx.buf_used]); + + if (!row_merge_write(merge_file[t_ctx.buf_used]->fd, + merge_file[t_ctx.buf_used]->offset++, + block[t_ctx.buf_used], + crypt_block[t_ctx.buf_used], + table->space_id)) { + error = DB_TEMP_FILE_WRITE_FAIL; + goto func_exit; + } + + MEM_UNDEFINED(block[t_ctx.buf_used], srv_sort_buf_size); + buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]); + mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used]; + t_ctx.rows_added[t_ctx.buf_used] = 0; + + ut_a(doc_item); + goto loop; + } + + /* Parent done scanning, and if finish processing all the docs, exit */ + if (psort_info->state == FTS_PARENT_COMPLETE) { + if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) { + goto exit; + } else if (retried > 10000) { + ut_ad(!doc_item); + /* retried too many times and cannot get new record */ + ib::error() << "FTS parallel sort processed " + << num_doc_processed + << " records, the sort queue has " + << UT_LIST_GET_LEN(psort_info->fts_doc_list) + << " records. But sort cannot get the next" + " records during alter table " << table->name; + goto exit; + } + } else if (psort_info->state == FTS_PARENT_EXITING) { + /* Parent abort */ + goto func_exit; + } + + if (doc_item == NULL) { + std::this_thread::yield(); + } + + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + + if (doc_item != NULL) { + if (last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + + retried = 0; + } else if (psort_info->state == FTS_PARENT_COMPLETE) { + retried++; + } + + goto loop; + +exit: + /* Do a final sort of the last (or latest) batch of records + in block memory. 
Flush them to temp file if records cannot + be hold in one block memory */ + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (t_ctx.rows_added[i]) { + row_merge_buf_sort(buf[i], NULL); + row_merge_buf_write(buf[i], +#ifndef DBUG_OFF + merge_file[i], +#endif + block[i]); + + /* Write to temp file, only if records have + been flushed to temp file before (offset > 0): + The pseudo code for sort is following: + + while (there are rows) { + tokenize rows, put result in block[] + if (block[] runs out) { + sort rows; + write to temp file with + row_merge_write(); + offset++; + } + } + + # write out the last batch + if (offset > 0) { + row_merge_write(); + offset++; + } else { + # no need to write anything + offset stay as 0 + } + + so if merge_file[i]->offset is 0 when we come to + here as the last batch, this means rows have + never flush to temp file, it can be held all in + memory */ + if (merge_file[i]->offset != 0) { + if (!row_merge_write(merge_file[i]->fd, + merge_file[i]->offset++, + block[i], + crypt_block[i], + table->space_id)) { + error = DB_TEMP_FILE_WRITE_FAIL; + goto func_exit; + } + +#ifdef HAVE_valgrind + MEM_UNDEFINED(block[i], srv_sort_buf_size); + + if (crypt_block[i]) { + MEM_UNDEFINED(crypt_block[i], + srv_sort_buf_size); + } +#endif /* HAVE_valgrind */ + } + + buf[i] = row_merge_buf_empty(buf[i]); + t_ctx.rows_added[i] = 0; + } + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n"); + } + + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + if (!merge_file[i]->offset) { + continue; + } + + tmpfd[i] = row_merge_file_create_low(path); + if (tmpfd[i] == OS_FILE_CLOSED) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + error = row_merge_sort(psort_info->psort_common->trx, + psort_info->psort_common->dup, + merge_file[i], block[i], &tmpfd[i], + false, 0.0/* pct_progress */, 0.0/* pct_cost */, + crypt_block[i], table->space_id); + + if (error != DB_SUCCESS) { + row_merge_file_destroy_low(tmpfd[i]); + goto func_exit; + } + + row_merge_file_destroy_low(tmpfd[i]); + } + +func_exit: + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n"); + } + + mem_heap_free(blob_heap); + + mysql_mutex_lock(&psort_info->mutex); + psort_info->error = error; + mysql_mutex_unlock(&psort_info->mutex); + + if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) { + /* child can exit either with error or told by parent. */ + ut_ad(error != DB_SUCCESS + || psort_info->state == FTS_PARENT_EXITING); + } + + /* Free fts doc list in case of error. */ + do { + row_merge_fts_get_next_doc_item(psort_info, &doc_item); + } while (doc_item != NULL); + + mysql_mutex_lock(&psort_info->mutex); + psort_info->child_status = FTS_CHILD_COMPLETE; + pthread_cond_signal(&psort_info->psort_common->sort_cond); + mysql_mutex_unlock(&psort_info->mutex); +} + +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info) /*!< parallel sort structure */ +{ + ulint i = 0; + + for (i = 0; i < fts_sort_pll_degree; i++) { + psort_info[i].psort_id = i; + psort_info[i].task = + new tpool::waitable_task(fts_parallel_tokenization,&psort_info[i]); + srv_thread_pool->submit_task(psort_info[i].task); + } +} + +/*********************************************************************//** +Function performs the merge and insertion of the sorted records. 
*/ +static +void +fts_parallel_merge( +/*===============*/ + void* arg) /*!< in: parallel merge info */ +{ + fts_psort_t* psort_info = (fts_psort_t*) arg; + ulint id; + + ut_ad(psort_info); + + id = psort_info->psort_id; + + row_fts_merge_insert(psort_info->psort_common->dup->index, + psort_info->psort_common->new_table, + psort_info->psort_common->all_info, id); +} + +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info) /*!< in: parallel sort info */ +{ + ulint i = 0; + + /* Kick off merge/insert tasks */ + for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + merge_info[i].psort_id = i; + merge_info[i].child_status = 0; + + merge_info[i].task = new tpool::waitable_task( + fts_parallel_merge, + (void*) &merge_info[i]); + srv_thread_pool->submit_task(merge_info[i].task); + } +} + +/** +Write out a single word's data as new entry/entries in the INDEX table. +@param[in] ins_ctx insert context +@param[in] word word string +@param[in] node node colmns +@return DB_SUCCUESS if insertion runs fine, otherwise error code */ +static +dberr_t +row_merge_write_fts_node( + const fts_psort_insert_t* ins_ctx, + const fts_string_t* word, + const fts_node_t* node) +{ + dtuple_t* tuple; + dfield_t* field; + dberr_t ret = DB_SUCCESS; + doc_id_t write_first_doc_id[8]; + doc_id_t write_last_doc_id[8]; + ib_uint32_t write_doc_count; + + tuple = ins_ctx->tuple; + + /* The first field is the tokenized word */ + field = dtuple_get_nth_field(tuple, 0); + dfield_set_data(field, word->f_str, word->f_len); + + /* The second field is first_doc_id */ + field = dtuple_get_nth_field(tuple, 1); + fts_write_doc_id((byte*)&write_first_doc_id, node->first_doc_id); + dfield_set_data(field, &write_first_doc_id, sizeof(doc_id_t)); + + /* The third and fourth fileds(TRX_ID, ROLL_PTR) are filled already.*/ + /* The fifth field is last_doc_id */ + field = dtuple_get_nth_field(tuple, 4); + fts_write_doc_id((byte*)&write_last_doc_id, node->last_doc_id); + dfield_set_data(field, &write_last_doc_id, sizeof(doc_id_t)); + + /* The sixth field is doc_count */ + field = dtuple_get_nth_field(tuple, 5); + mach_write_to_4((byte*)&write_doc_count, (ib_uint32_t)node->doc_count); + dfield_set_data(field, &write_doc_count, sizeof(ib_uint32_t)); + + /* The seventh field is ilist */ + field = dtuple_get_nth_field(tuple, 6); + dfield_set_data(field, node->ilist, node->ilist_size); + + ret = ins_ctx->btr_bulk->insert(tuple); + + return(ret); +} + +/********************************************************************//** +Insert processed FTS data to auxillary index tables. 
+@return DB_SUCCESS if insertion runs fine */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +row_merge_write_fts_word( +/*=====================*/ + fts_psort_insert_t* ins_ctx, /*!< in: insert context */ + fts_tokenizer_word_t* word) /*!< in: sorted and tokenized + word */ +{ + dberr_t ret = DB_SUCCESS; + + ut_ad(ins_ctx->aux_index_id == fts_select_index( + ins_ctx->charset, word->text.f_str, word->text.f_len)); + + /* Pop out each fts_node in word->nodes write them to auxiliary table */ + for (ulint i = 0; i < ib_vector_size(word->nodes); i++) { + dberr_t error; + fts_node_t* fts_node; + + fts_node = static_cast<fts_node_t*>(ib_vector_get(word->nodes, i)); + + error = row_merge_write_fts_node(ins_ctx, &word->text, fts_node); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "Failed to write word to FTS auxiliary" + " index table " + << ins_ctx->btr_bulk->table_name() + << ", error " << error; + ret = error; + } + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + ib_vector_reset(word->nodes); + + return(ret); +} + +/*********************************************************************//** +Read sorted FTS data files and insert data tuples to auxillary tables. +@return DB_SUCCESS or error number */ +static +void +row_fts_insert_tuple( +/*=================*/ + fts_psort_insert_t* + ins_ctx, /*!< in: insert context */ + fts_tokenizer_word_t* word, /*!< in: last processed + tokenized word */ + ib_vector_t* positions, /*!< in: word position */ + doc_id_t* in_doc_id, /*!< in: last item doc id */ + dtuple_t* dtuple) /*!< in: entry to insert */ +{ + fts_node_t* fts_node = NULL; + dfield_t* dfield; + doc_id_t doc_id; + ulint position; + fts_string_t token_word; + ulint i; + + /* Get fts_node for the FTS auxillary INDEX table */ + if (ib_vector_size(word->nodes) > 0) { + fts_node = static_cast<fts_node_t*>( + ib_vector_last(word->nodes)); + } + + if (fts_node == NULL + || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) { + + fts_node = static_cast<fts_node_t*>( + ib_vector_push(word->nodes, NULL)); + + memset(fts_node, 0x0, sizeof(*fts_node)); + } + + /* If dtuple == NULL, this is the last word to be processed */ + if (!dtuple) { + if (fts_node && ib_vector_size(positions) > 0) { + fts_cache_node_add_positions( + NULL, fts_node, *in_doc_id, + positions); + + /* Write out the current word */ + row_merge_write_fts_word(ins_ctx, word); + } + + return; + } + + /* Get the first field for the tokenized word */ + dfield = dtuple_get_nth_field(dtuple, 0); + + token_word.f_n_char = 0; + token_word.f_len = dfield->len; + token_word.f_str = static_cast<byte*>(dfield_get_data(dfield)); + + if (!word->text.f_str) { + fts_string_dup(&word->text, &token_word, ins_ctx->heap); + } + + /* compare to the last word, to see if they are the same + word */ + if (innobase_fts_text_cmp(ins_ctx->charset, + &word->text, &token_word) != 0) { + ulint num_item; + + /* Getting a new word, flush the last position info + for the currnt word in fts_node */ + if (ib_vector_size(positions) > 0) { + fts_cache_node_add_positions( + NULL, fts_node, *in_doc_id, positions); + } + + /* Write out the current word */ + row_merge_write_fts_word(ins_ctx, word); + + /* Copy the new word */ + fts_string_dup(&word->text, &token_word, ins_ctx->heap); + + num_item = ib_vector_size(positions); + + /* Clean up position queue */ + for (i = 0; i < num_item; i++) { + ib_vector_pop(positions); + } + + /* Reset Doc ID */ + *in_doc_id = 0; + memset(fts_node, 0x0, sizeof(*fts_node)); + } + + /* Get the word's Doc ID */ + dfield = 
dtuple_get_nth_field(dtuple, 1); + + if (!ins_ctx->opt_doc_id_size) { + doc_id = fts_read_doc_id( + static_cast<byte*>(dfield_get_data(dfield))); + } else { + doc_id = (doc_id_t) mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + } + + /* Get the word's position info */ + dfield = dtuple_get_nth_field(dtuple, 2); + position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))); + + /* If this is the same word as the last word, and they + have the same Doc ID, we just need to add its position + info. Otherwise, we will flush position info to the + fts_node and initiate a new position vector */ + if (!(*in_doc_id) || *in_doc_id == doc_id) { + ib_vector_push(positions, &position); + } else { + ulint num_pos = ib_vector_size(positions); + + fts_cache_node_add_positions(NULL, fts_node, + *in_doc_id, positions); + for (i = 0; i < num_pos; i++) { + ib_vector_pop(positions); + } + ib_vector_push(positions, &position); + } + + /* record the current Doc ID */ + *in_doc_id = doc_id; +} + +/*********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +static +ulint +row_fts_sel_tree_propagate( +/*=======================*/ + ulint propogated, /*<! in: tree node propagated */ + int* sel_tree, /*<! in: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in/out: FTS index */ +{ + ulint parent; + int child_left; + int child_right; + int selected; + + /* Find which parent this value will be propagated to */ + parent = (propogated - 1) / 2; + + /* Find out which value is smaller, and to propagate */ + child_left = sel_tree[parent * 2 + 1]; + child_right = sel_tree[parent * 2 + 2]; + + if (child_left == -1 || mrec[child_left] == NULL) { + if (child_right == -1 + || mrec[child_right] == NULL) { + selected = -1; + } else { + selected = child_right ; + } + } else if (child_right == -1 + || mrec[child_right] == NULL) { + selected = child_left; + } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right], + offsets[child_left], + offsets[child_right], + index, NULL) < 0) { + selected = child_left; + } else { + selected = child_right; + } + + sel_tree[parent] = selected; + + return parent; +} + +/*********************************************************************//** +Readjust selection tree after popping the root and read a new value +@return the new root */ +static +int +row_fts_sel_tree_update( +/*====================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint propagated, /*<! in: node to propagate up */ + ulint height, /*<! in: tree height */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in: index dictionary */ +{ + ulint i; + + for (i = 1; i <= height; i++) { + propagated = row_fts_sel_tree_propagate( + propagated, sel_tree, mrec, offsets, index); + } + + return(sel_tree[0]); +} + +/*********************************************************************//** +Build selection tree at a specified level */ +static +void +row_fts_build_sel_tree_level( +/*=========================*/ + int* sel_tree, /*<! in/out: selection tree */ + ulint level, /*<! in: selection tree level */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! 
in: index dictionary */ +{ + ulint start; + int child_left; + int child_right; + ulint i; + ulint num_item = ulint(1) << level; + + start = num_item - 1; + + for (i = 0; i < num_item; i++) { + child_left = sel_tree[(start + i) * 2 + 1]; + child_right = sel_tree[(start + i) * 2 + 2]; + + if (child_left == -1) { + if (child_right == -1) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (child_right == -1) { + sel_tree[start + i] = child_left; + continue; + } + + /* Deal with NULL child conditions */ + if (!mrec[child_left]) { + if (!mrec[child_right]) { + sel_tree[start + i] = -1; + } else { + sel_tree[start + i] = child_right; + } + continue; + } else if (!mrec[child_right]) { + sel_tree[start + i] = child_left; + continue; + } + + /* Select the smaller one to set parent pointer */ + int cmp = cmp_rec_rec_simple( + mrec[child_left], mrec[child_right], + offsets[child_left], offsets[child_right], + index, NULL); + + sel_tree[start + i] = cmp < 0 ? child_left : child_right; + } +} + +/*********************************************************************//** +Build a selection tree for merge. The selection tree is a binary tree +and should have fts_sort_pll_degree / 2 levels. With root as level 0 +@return number of tree levels */ +static +ulint +row_fts_build_sel_tree( +/*===================*/ + int* sel_tree, /*<! in/out: selection tree */ + const mrec_t** mrec, /*<! in: sort record */ + rec_offs** offsets, /*<! in: record offsets */ + dict_index_t* index) /*<! in: index dictionary */ +{ + ulint treelevel = 1; + ulint num = 2; + ulint i = 0; + ulint start; + + /* No need to build selection tree if we only have two merge threads */ + if (fts_sort_pll_degree <= 2) { + return(0); + } + + while (num < fts_sort_pll_degree) { + num = num << 1; + treelevel++; + } + + start = (ulint(1) << treelevel) - 1; + + for (i = 0; i < fts_sort_pll_degree; i++) { + sel_tree[i + start] = int(i); + } + + i = treelevel; + do { + row_fts_build_sel_tree_level( + sel_tree, --i, mrec, offsets, index); + } while (i > 0); + + return(treelevel); +} + +/*********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +dberr_t +row_fts_merge_insert( +/*=================*/ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + fts_psort_t* psort_info, /*!< parallel sort info */ + ulint id) /* !< in: which auxiliary table's data + to insert to */ +{ + const byte** b; + mem_heap_t* tuple_heap; + mem_heap_t* heap; + dberr_t error = DB_SUCCESS; + ulint* foffs; + rec_offs** offsets; + fts_tokenizer_word_t new_word; + ib_vector_t* positions; + doc_id_t last_doc_id; + ib_alloc_t* heap_alloc; + ulint i; + mrec_buf_t** buf; + pfs_os_file_t* fd; + byte** block; + byte** crypt_block; + const mrec_t** mrec; + ulint count = 0; + int* sel_tree; + ulint height; + ulint start; + fts_psort_insert_t ins_ctx; + uint64_t count_diag = 0; + fts_table_t fts_table; + char aux_table_name[MAX_FULL_NAME_LEN]; + dict_table_t* aux_table; + dict_index_t* aux_index; + trx_t* trx; + + /* We use the insert query graph as the dummy graph + needed in the row module call */ + + trx = trx_create(); + trx_start_if_not_started(trx, true); + + trx->op_info = "inserting index entries"; + + ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size; + + heap = mem_heap_create(500 + sizeof(mrec_buf_t)); + + b = (const byte**) 
mem_heap_alloc( + heap, sizeof (*b) * fts_sort_pll_degree); + foffs = (ulint*) mem_heap_alloc( + heap, sizeof(*foffs) * fts_sort_pll_degree); + offsets = (rec_offs**) mem_heap_alloc( + heap, sizeof(*offsets) * fts_sort_pll_degree); + buf = (mrec_buf_t**) mem_heap_alloc( + heap, sizeof(*buf) * fts_sort_pll_degree); + fd = (pfs_os_file_t*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree); + block = (byte**) mem_heap_alloc( + heap, sizeof(*block) * fts_sort_pll_degree); + crypt_block = (byte**) mem_heap_alloc( + heap, sizeof(*block) * fts_sort_pll_degree); + mrec = (const mrec_t**) mem_heap_alloc( + heap, sizeof(*mrec) * fts_sort_pll_degree); + sel_tree = (int*) mem_heap_alloc( + heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2)); + + tuple_heap = mem_heap_create(1000); + + ins_ctx.charset = fts_index_get_charset(index); + ins_ctx.heap = heap; + + for (i = 0; i < fts_sort_pll_degree; i++) { + ulint num; + + num = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets[i] = static_cast<rec_offs*>(mem_heap_zalloc( + heap, num * sizeof *offsets[i])); + rec_offs_set_n_alloc(offsets[i], num); + rec_offs_set_n_fields(offsets[i], dict_index_get_n_fields(index)); + block[i] = psort_info[i].merge_block[id]; + crypt_block[i] = psort_info[i].crypt_block[id]; + b[i] = psort_info[i].merge_block[id]; + fd[i] = psort_info[i].merge_file[id]->fd; + foffs[i] = 0; + + buf[i] = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, sizeof *buf[i])); + + count_diag += psort_info[i].merge_file[id]->n_rec; + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "InnoDB_FTS: to insert " << count_diag + << " records"; + } + + /* Initialize related variables if creating FTS indexes */ + heap_alloc = ib_heap_allocator_create(heap); + + memset(&new_word, 0, sizeof(new_word)); + + new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4); + positions = ib_vector_create(heap_alloc, sizeof(ulint), 32); + last_doc_id = 0; + + /* We should set the flags2 with aux_table_name here, + in order to get the correct aux table names. 
*/ + index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME + & ((1U << DICT_TF2_BITS) - 1);); + fts_table.type = FTS_INDEX_TABLE; + fts_table.index_id = index->id; + fts_table.table_id = table->id; + fts_table.table = index->table; + fts_table.suffix = fts_get_suffix(id); + + /* Get aux index */ + fts_get_table_name(&fts_table, aux_table_name); + aux_table = dict_table_open_on_name(aux_table_name, false, + DICT_ERR_IGNORE_NONE); + ut_ad(aux_table != NULL); + aux_index = dict_table_get_first_index(aux_table); + + ut_ad(!aux_index->is_instant()); + /* row_merge_write_fts_node() depends on the correct value */ + ut_ad(aux_index->n_core_null_bytes + == UT_BITS_IN_BYTES(aux_index->n_nullable)); + + /* Create bulk load instance */ + ins_ctx.btr_bulk = UT_NEW_NOKEY(BtrBulk(aux_index, trx)); + + /* Create tuple for insert */ + ins_ctx.tuple = dtuple_create(heap, dict_index_get_n_fields(aux_index)); + dict_index_copy_types(ins_ctx.tuple, aux_index, + dict_index_get_n_fields(aux_index)); + + /* Set TRX_ID and ROLL_PTR */ + dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 2), + &reset_trx_id, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(ins_ctx.tuple, 3), + &reset_trx_id[DATA_TRX_ID_LEN], DATA_ROLL_PTR_LEN); + + ut_d(ins_ctx.aux_index_id = id); + + const ulint space = table->space_id; + + for (i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].merge_file[id]->n_rec == 0) { + /* No Rows to read */ + mrec[i] = b[i] = NULL; + } else { + /* Read from temp file only if it has been + written to. Otherwise, block memory holds + all the sorted records */ + if (psort_info[i].merge_file[id]->offset > 0 + && (!row_merge_read( + fd[i], foffs[i], + (row_merge_block_t*) block[i], + (row_merge_block_t*) crypt_block[i], + space))) { + error = DB_CORRUPTION; + goto exit; + } + + ROW_MERGE_READ_GET_NEXT(i); + } + } + + height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec, + offsets, index); + + start = (1U << height) - 1; + + /* Fetch sorted records from sort buffer and insert them into + corresponding FTS index auxiliary tables */ + for (;;) { + dtuple_t* dtuple; + int min_rec = 0; + + if (fts_sort_pll_degree <= 2) { + while (!mrec[min_rec]) { + min_rec++; + + if (min_rec >= (int) fts_sort_pll_degree) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + for (i = min_rec + 1; i < fts_sort_pll_degree; i++) { + if (!mrec[i]) { + continue; + } + + if (cmp_rec_rec_simple( + mrec[i], mrec[min_rec], + offsets[i], offsets[min_rec], + index, NULL) < 0) { + min_rec = static_cast<int>(i); + } + } + } else { + min_rec = sel_tree[0]; + + if (min_rec == -1) { + row_fts_insert_tuple( + &ins_ctx, &new_word, + positions, &last_doc_id, + NULL); + + goto exit; + } + } + + dtuple = row_rec_to_index_entry_low( + mrec[min_rec], index, offsets[min_rec], + tuple_heap); + + row_fts_insert_tuple( + &ins_ctx, &new_word, positions, + &last_doc_id, dtuple); + + + ROW_MERGE_READ_GET_NEXT(min_rec); + + if (fts_sort_pll_degree > 2) { + if (!mrec[min_rec]) { + sel_tree[start + min_rec] = -1; + } + + row_fts_sel_tree_update(sel_tree, start + min_rec, + height, mrec, + offsets, index); + } + + count++; + + mem_heap_empty(tuple_heap); + } + +exit: + fts_sql_commit(trx); + + trx->op_info = ""; + + mem_heap_free(tuple_heap); + + error = ins_ctx.btr_bulk->finish(error); + UT_DELETE(ins_ctx.btr_bulk); + + aux_table->release(); + + trx->free(); + + 
mem_heap_free(heap); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "InnoDB_FTS: inserted " << count << " records"; + } + + return(error); +} diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc new file mode 100644 index 00000000..d2609fdb --- /dev/null +++ b/storage/innobase/row/row0import.cc @@ -0,0 +1,4585 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0import.cc +Import a tablespace to a running instance. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0import.h" +#include "btr0pcur.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif +#include "buf0flu.h" +#include "que0que.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "pars0pars.h" +#include "row0row.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "row0quiesce.h" +#include "fil0pagecompress.h" +#include "trx0undo.h" +#include "lock0lock.h" +#include "lzo/lzo1x.h" +#include "snappy-c.h" +#include "log.h" + +#include "scope.h" + +#include <vector> + +#ifdef HAVE_MY_AES_H +#include <my_aes.h> +#endif + +using st_::span; + +/** The size of the buffer to use for IO. +@param n physical page size +@return number of pages */ +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / (n)) + +/** For gathering stats on records during phase I */ +struct row_stats_t { + ulint m_n_deleted; /*!< Number of deleted records + found in the index */ + + ulint m_n_purged; /*!< Number of records purged + optimisatically */ + + ulint m_n_rows; /*!< Number of rows */ + + ulint m_n_purge_failed; /*!< Number of deleted rows + that could not be purged */ +}; + +/** Index information required by IMPORT. 
*/ +struct row_index_t { + index_id_t m_id; /*!< Index id of the table + in the exporting server */ + byte* m_name; /*!< Index name */ + + uint32_t m_space; /*!< Space where it is placed */ + + uint32_t m_page_no; /*!< Root page number */ + + ulint m_type; /*!< Index type */ + + ulint m_trx_id_offset; /*!< Relevant only for clustered + indexes, offset of transaction + id system column */ + + ulint m_n_user_defined_cols; /*!< User defined columns */ + + ulint m_n_uniq; /*!< Number of columns that can + uniquely identify the row */ + + ulint m_n_nullable; /*!< Number of nullable + columns */ + + ulint m_n_fields; /*!< Total number of fields */ + + dict_field_t* m_fields; /*!< Index fields */ + + const dict_index_t* + m_srv_index; /*!< Index instance in the + importing server */ + + row_stats_t m_stats; /*!< Statistics gathered during + the import phase */ + +}; + +/** Meta data required by IMPORT. */ +struct row_import { + row_import() UNIV_NOTHROW + : + m_table(NULL), + m_version(0), + m_hostname(NULL), + m_table_name(NULL), + m_autoinc(0), + m_zip_size(0), + m_flags(0), + m_n_cols(0), + m_cols(NULL), + m_col_names(NULL), + m_n_indexes(0), + m_indexes(NULL), + m_missing(true) { } + + ~row_import() UNIV_NOTHROW; + + /** Find the index entry in in the indexes array. + @param name index name + @return instance if found else 0. */ + row_index_t* get_index(const char* name) const UNIV_NOTHROW; + + /** Get the number of rows in the index. + @param name index name + @return number of rows (doesn't include delete marked rows). */ + ulint get_n_rows(const char* name) const UNIV_NOTHROW; + + /** Find the ordinal value of the column name in the cfg table columns. + @param name of column to look for. + @return ULINT_UNDEFINED if not found. */ + ulint find_col(const char* name) const UNIV_NOTHROW; + + /** Get the number of rows for which purge failed during the + convert phase. + @param name index name + @return number of rows for which purge failed. */ + ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW; + + /** Check if the index is clean. ie. no delete-marked records + @param name index name + @return true if index needs to be purged. */ + bool requires_purge(const char* name) const UNIV_NOTHROW + { + return(get_n_purge_failed(name) > 0); + } + + /** Set the index root <space, pageno> using the index name */ + void set_root_by_name() UNIV_NOTHROW; + + /** Set the index root <space, pageno> using a heuristic + @return DB_SUCCESS or error code */ + dberr_t set_root_by_heuristic() UNIV_NOTHROW; + + /** Check if the index schema that was read from the .cfg file + matches the in memory index definition. + Note: It will update row_import_t::m_srv_index to map the meta-data + read from the .cfg file to the server index instance. + @return DB_SUCCESS or error code. */ + dberr_t match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW; + + /** Check if the table schema that was read from the .cfg file + matches the in memory table definition. + @param thd MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_table_columns( + THD* thd) UNIV_NOTHROW; + + /** Check if the table (and index) schema that was read from the + .cfg file matches the in memory table definition. + @param thd MySQL session variable + @return DB_SUCCESS or error code. 
*/ + dberr_t match_schema( + THD* thd) UNIV_NOTHROW; + + dberr_t match_flags(THD *thd) const ; + + + dict_table_t* m_table; /*!< Table instance */ + + ulint m_version; /*!< Version of config file */ + + byte* m_hostname; /*!< Hostname where the + tablespace was exported */ + byte* m_table_name; /*!< Exporting instance table + name */ + + ib_uint64_t m_autoinc; /*!< Next autoinc value */ + + ulint m_zip_size; /*!< ROW_FORMAT=COMPRESSED + page size, or 0 */ + + ulint m_flags; /*!< Table flags */ + + ulint m_n_cols; /*!< Number of columns in the + meta-data file */ + + dict_col_t* m_cols; /*!< Column data */ + + byte** m_col_names; /*!< Column names, we store the + column naems separately becuase + there is no field to store the + value in dict_col_t */ + + ulint m_n_indexes; /*!< Number of indexes, + including clustered index */ + + row_index_t* m_indexes; /*!< Index meta data */ + + bool m_missing; /*!< true if a .cfg file was + found and was readable */ +}; + +struct fil_iterator_t { + pfs_os_file_t file; /*!< File handle */ + const char* filepath; /*!< File path name */ + os_offset_t start; /*!< From where to start */ + os_offset_t end; /*!< Where to stop */ + os_offset_t file_size; /*!< File size in bytes */ + ulint n_io_buffers; /*!< Number of pages to use + for IO */ + byte* io_buffer; /*!< Buffer to use for IO */ + fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */ + byte* crypt_io_buffer; /*!< IO buffer when encrypted */ +}; + +/** Use the page cursor to iterate over records in a block. */ +class RecIterator { +public: + /** Default constructor */ + RecIterator() UNIV_NOTHROW + { + memset(&m_cur, 0x0, sizeof(m_cur)); + /* Make page_cur_delete_rec() happy. */ + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + } + + /** Position the cursor on the first user record. */ + rec_t* open(buf_block_t* block, const dict_index_t* index) noexcept + MY_ATTRIBUTE((warn_unused_result)) + { + m_cur.index = const_cast<dict_index_t*>(index); + page_cur_set_before_first(block, &m_cur); + return next(); + } + + /** Move to the next record. */ + rec_t* next() noexcept MY_ATTRIBUTE((warn_unused_result)) + { + return page_cur_move_to_next(&m_cur); + } + + /** + @return the current record */ + rec_t* current() UNIV_NOTHROW + { + ut_ad(!end()); + return(page_cur_get_rec(&m_cur)); + } + + buf_block_t* current_block() const { return m_cur.block; } + + /** + @return true if cursor is at the end */ + bool end() UNIV_NOTHROW + { + return(page_cur_is_after_last(&m_cur) == TRUE); + } + + /** Remove the current record + @return true on success */ + bool remove(rec_offs* offsets) UNIV_NOTHROW + { + const dict_index_t* const index = m_cur.index; + ut_ad(page_is_leaf(m_cur.block->page.frame)); + /* We can't end up with an empty page unless it is root. 
*/ + if (page_get_n_recs(m_cur.block->page.frame) <= 1) { + return(false); + } + + if (!rec_offs_any_extern(offsets) + && m_cur.block->page.id().page_no() != index->page + && ((page_get_data_size(m_cur.block->page.frame) + - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT(index)) + || !page_has_siblings(m_cur.block->page.frame) + || (page_get_n_recs(m_cur.block->page.frame) < 2))) { + return false; + } + +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block); + ut_a(!page_zip || page_zip_validate( + page_zip, m_cur.block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(&m_cur, offsets, &m_mtr); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate( + page_zip, m_cur.block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + return true; + } + +private: + page_cur_t m_cur; +public: + mtr_t m_mtr; +}; + +/** Class that purges delete marked records from indexes, both secondary +and cluster. It does a pessimistic delete. This should only be done if we +couldn't purge the delete marked reocrds during Phase I. */ +class IndexPurge { +public: + /** Constructor + @param trx the user transaction covering the import tablespace + @param index to be imported + @param space_id space id of the tablespace */ + IndexPurge( + trx_t* trx, + dict_index_t* index) UNIV_NOTHROW + : + m_trx(trx), + m_index(index), + m_n_rows(0) + { + ib::info() << "Phase II - Purge records from index " + << index->name; + } + + /** Destructor */ + ~IndexPurge() UNIV_NOTHROW = default; + + /** Purge delete marked records. + @return DB_SUCCESS or error code. */ + dberr_t garbage_collect() UNIV_NOTHROW; + + /** The number of records that are not delete marked. + @return total records in the index after purge */ + ulint get_n_rows() const UNIV_NOTHROW + { + return(m_n_rows); + } + +private: + /** Begin import, position the cursor on the first record. */ + inline bool open() noexcept; + + /** Close the persistent cursor and commit the mini-transaction. */ + void close() noexcept { m_mtr.commit(); btr_pcur_close(&m_pcur); } + + /** Position the cursor on the next record. + @return DB_SUCCESS or error code */ + dberr_t next() noexcept; + + /** Store the persistent cursor position and reopen the + B-tree cursor in BTR_MODIFY_TREE mode, because the + tree structure may be changed during a pessimistic delete. */ + inline dberr_t purge_pessimistic_delete() noexcept; + + /** Purge a delete-marked record. */ + dberr_t purge() noexcept; + +protected: + // Disable copying + IndexPurge(); + IndexPurge(const IndexPurge&); + IndexPurge &operator=(const IndexPurge&); + +private: + trx_t* m_trx; /*!< User transaction */ + mtr_t m_mtr; /*!< Mini-transaction */ + btr_pcur_t m_pcur; /*!< Persistent cursor */ + dict_index_t* m_index; /*!< Index to be processed */ + ulint m_n_rows; /*!< Records in index */ +}; + +/** Functor that is called for each physical page that is read from the +tablespace file. 
*/ +class AbstractCallback +{ +public: + /** Constructor + @param trx covering transaction */ + AbstractCallback(trx_t* trx, uint32_t space_id) + : + m_zip_size(0), + m_trx(trx), + m_space(space_id), + m_xdes(), + m_xdes_page_no(UINT32_MAX), + m_space_flags(UINT32_MAX) UNIV_NOTHROW { } + + /** Free any extent descriptor instance */ + virtual ~AbstractCallback() + { + UT_DELETE_ARRAY(m_xdes); + } + + /** Determine the page size to use for traversing the tablespace + @param file_size size of the tablespace file in bytes + @param block contents of the first page in the tablespace file. + @retval DB_SUCCESS or error code. */ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW; + + /** @return true if compressed table. */ + bool is_compressed_table() const UNIV_NOTHROW + { + return get_zip_size(); + } + + /** @return the tablespace flags */ + uint32_t get_space_flags() const { return m_space_flags; } + + /** + Set the name of the physical file and the file handle that is used + to open it for the file that is being iterated over. + @param filename the physical name of the tablespace file + @param file OS file handle */ + void set_file(const char* filename, pfs_os_file_t file) UNIV_NOTHROW + { + m_file = file; + m_filepath = filename; + } + + ulint get_zip_size() const { return m_zip_size; } + ulint physical_size() const + { + return m_zip_size ? m_zip_size : srv_page_size; + } + + const char* filename() const { return m_filepath; } + + /** + Called for every page in the tablespace. If the page was not + updated then its state must be set to BUF_PAGE_NOT_USED. For + compressed tables the page descriptor memory will be at offset: + block->page.frame + srv_page_size; + @param block block read from file, note it is not from the buffer pool + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0; + + /** @return the tablespace identifier */ + uint32_t get_space_id() const { return m_space; } + + bool is_interrupted() const { return trx_is_interrupted(m_trx); } + + /** + Get the data page depending on the table type, compressed or not. + @param block - block read from disk + @retval the buffer frame */ + static byte* get_frame(const buf_block_t* block) + { + return block->page.zip.data + ? block->page.zip.data : block->page.frame; + } + + /** Invoke the functionality for the callback */ + virtual dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW = 0; + +protected: + /** Get the physical offset of the extent descriptor within the page. + @param page_no page number of the extent descriptor + @param page contents of the page containing the extent descriptor. + @return the start of the xdes array in a page */ + const xdes_t* xdes( + ulint page_no, + const page_t* page) const UNIV_NOTHROW + { + ulint offset; + + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); + + return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); + } + + /** Set the current page directory (xdes). If the extent descriptor is + marked as free then free the current extent descriptor and set it to + 0. This implies that all pages that are covered by this extent + descriptor are also freed. + + @param page_no offset of page within the file + @param page page contents + @return DB_SUCCESS or error code. 
*/ + dberr_t set_current_xdes( + uint32_t page_no, + const page_t* page) UNIV_NOTHROW + { + m_xdes_page_no = page_no; + + UT_DELETE_ARRAY(m_xdes); + m_xdes = NULL; + + if (mach_read_from_4(XDES_ARR_OFFSET + XDES_STATE + page) + != XDES_FREE) { + const ulint physical_size = m_zip_size + ? m_zip_size : srv_page_size; + + m_xdes = UT_NEW_ARRAY_NOKEY(xdes_t, physical_size); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_13", + UT_DELETE_ARRAY(m_xdes); + m_xdes = NULL; + ); + + if (m_xdes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(m_xdes, page, physical_size); + } + + return(DB_SUCCESS); + } + + /** Check if the page is marked as free in the extent descriptor. + @param page_no page number to check in the extent descriptor. + @return true if the page is marked as free */ + bool is_free(uint32_t page_no) const UNIV_NOTHROW + { + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) + == m_xdes_page_no); + + if (m_xdes != 0) { + const xdes_t* xdesc = xdes(page_no, m_xdes); + ulint pos = page_no % FSP_EXTENT_SIZE; + + return xdes_is_free(xdesc, pos); + } + + /* If the current xdes was free, the page must be free. */ + return(true); + } + +protected: + /** The ROW_FORMAT=COMPRESSED page size, or 0. */ + ulint m_zip_size; + + /** File handle to the tablespace */ + pfs_os_file_t m_file; + + /** Physical file path. */ + const char* m_filepath; + + /** Covering transaction. */ + trx_t* m_trx; + + /** Space id of the file being iterated over. */ + uint32_t m_space; + + /** Current extent descriptor page */ + xdes_t* m_xdes; + + /** Physical page offset in the file of the extent descriptor */ + uint32_t m_xdes_page_no; + + /** Flags value read from the header page */ + uint32_t m_space_flags; +}; + +ATTRIBUTE_COLD static dberr_t invalid_space_flags(uint32_t flags) +{ + if (fsp_flags_is_incompatible_mysql(flags)) + { + sql_print_error("InnoDB: unsupported MySQL tablespace"); + return DB_UNSUPPORTED; + } + + sql_print_error("InnoDB: Invalid FSP_SPACE_FLAGS=0x%" PRIx32, flags); + return DB_CORRUPTION; +} + +/** Determine the page size to use for traversing the tablespace +@param file_size size of the tablespace file in bytes +@param block contents of the first page in the tablespace file. +@retval DB_SUCCESS or error code. */ +dberr_t +AbstractCallback::init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW +{ + const page_t* page = block->page.frame; + + m_space_flags = fsp_header_get_flags(page); + if (!fil_space_t::is_valid_flags(m_space_flags, true)) { + uint32_t cflags = fsp_flags_convert_from_101(m_space_flags); + if (cflags == UINT32_MAX) { + return DB_CORRUPTION; + } + m_space_flags = cflags; + } + + /* Clear the DATA_DIR flag, which is basically garbage. 
*/ + m_space_flags &= ~(1U << FSP_FLAGS_POS_RESERVED); + m_zip_size = fil_space_t::zip_size(m_space_flags); + const ulint logical_size = fil_space_t::logical_size(m_space_flags); + const ulint physical_size = fil_space_t::physical_size(m_space_flags); + + if (logical_size != srv_page_size) { + + ib::error() << "Page size " << logical_size + << " of ibd file is not the same as the server page" + " size " << srv_page_size; + + return(DB_CORRUPTION); + + } else if (file_size & (physical_size - 1)) { + + ib::error() << "File size " << file_size << " is not a" + " multiple of the page size " + << physical_size; + + return(DB_CORRUPTION); + } + + if (m_space == UINT32_MAX) { + m_space = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + + page); + } + + return set_current_xdes(0, page); +} + +/** +TODO: This can be made parallel trivially by chunking up the file +and creating a callback per thread.. Main benefit will be to use +multiple CPUs for checksums and compressed tables. We have to do +compressed tables block by block right now. Secondly we need to +decompress/compress and copy too much of data. These are +CPU intensive. + +Iterate over all the pages in the tablespace. +@param iter - Tablespace iterator +@param block - block to use for IO +@param callback - Callback to inspect and update page contents +@retval DB_SUCCESS or error code */ +static dberr_t fil_iterate( + const fil_iterator_t& iter, + buf_block_t* block, + AbstractCallback& callback); + +/** +Try and determine the index root pages by checking if the next/prev +pointers are both FIL_NULL. We need to ensure that skip deleted pages. */ +struct FetchIndexRootPages : public AbstractCallback { + + /** Index information gathered from the .ibd file. */ + struct Index { + + Index(index_id_t id, uint32_t page_no) + : + m_id(id), + m_page_no(page_no) { } + + index_id_t m_id; /*!< Index id */ + uint32_t m_page_no; /*!< Root page number */ + }; + + /** Constructor + @param trx covering (user) transaction + @param table table definition in server .*/ + FetchIndexRootPages(const dict_table_t* table, trx_t* trx) + : + AbstractCallback(trx, UINT32_MAX), + m_table(table), m_index(0, 0) UNIV_NOTHROW { } + + /** Destructor */ + ~FetchIndexRootPages() UNIV_NOTHROW override = default; + + /** Fetch the clustered index root page in the tablespace + @param iter Tablespace iterator + @param block Block to use for IO + @retval DB_SUCCESS or error code */ + dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW override; + + /** Called for each block as it is read from the file. + @param block block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; + + /** Update the import configuration that will be used to import + the tablespace. */ + dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW; + + /** Table definition in server. */ + const dict_table_t* m_table; + + /** Index information */ + Index m_index; +}; + +/** Called for each block as it is read from the file. Check index pages to +determine the exact row format. We can't get that from the tablespace +header flags alone. + +@param block block to convert, it is not from the buffer pool. +@retval DB_SUCCESS or error code. 
*/ +dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW +{ + if (is_interrupted()) return DB_INTERRUPTED; + + const page_t* page = get_frame(block); + + m_index.m_id = btr_page_get_index_id(page); + m_index.m_page_no = block->page.id().page_no(); + + /* Check that the tablespace flags match the table flags. */ + const uint32_t expected = dict_tf_to_fsp_flags(m_table->flags); + if (!fsp_flags_match(expected, m_space_flags)) { + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Expected FSP_SPACE_FLAGS=0x%x, .ibd " + "file contains 0x%x.", + unsigned(expected), + unsigned(m_space_flags)); + return(DB_CORRUPTION); + } + + if (!page_is_comp(block->page.frame) != + !dict_table_is_comp(m_table)) { + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "ROW_FORMAT mismatch"); + return DB_CORRUPTION; + } + + return DB_SUCCESS; +} + +/** +Update the import configuration that will be used to import the tablespace. +@return error code or DB_SUCCESS */ +dberr_t +FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW +{ + ut_a(cfg->m_table == m_table); + cfg->m_zip_size = m_zip_size; + cfg->m_n_indexes = 1; + + if (cfg->m_n_indexes == 0) { + + ib::error() << "No B+Tree found in tablespace"; + + return(DB_CORRUPTION); + } + + cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_11", + UT_DELETE_ARRAY(cfg->m_indexes); + cfg->m_indexes = NULL; + ); + + if (cfg->m_indexes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + row_index_t* cfg_index = cfg->m_indexes; + + char name[BUFSIZ]; + + snprintf(name, sizeof(name), "index" IB_ID_FMT, m_index.m_id); + + ulint len = strlen(name) + 1; + + cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_12", + UT_DELETE_ARRAY(cfg_index->m_name); + cfg_index->m_name = NULL; + ); + + if (cfg_index->m_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(cfg_index->m_name, name, len); + + cfg_index->m_id = m_index.m_id; + + cfg_index->m_space = m_space; + + cfg_index->m_page_no = m_index.m_page_no; + + return(DB_SUCCESS); +} + +/* Functor that is called for each physical page that is read from the +tablespace file. + + 1. Check each page for corruption. + + 2. Update the space id and LSN on every page + * For the header page + - Validate the flags + - Update the LSN + + 3. On Btree pages + * Set the index id + * Update the max trx id + * In a cluster index, update the system columns + * In a cluster index, update the BLOB ptr, set the space id + * Purge delete marked records, but only if they can be easily + removed from the page + * Keep a counter of number of rows, ie. non-delete-marked rows + * Keep a counter of number of delete marked rows + * Keep a counter of number of purge failure + * If a page is stamped with an index id that isn't in the .cfg file + we assume it is deleted and the page can be ignored. + + 4. Set the page state to dirty so that it will be written to disk. +*/ +class PageConverter : public AbstractCallback { +public: + /** Constructor + @param cfg config of table being imported. 
+ @param space_id tablespace identifier + @param trx transaction covering the import */ + PageConverter(row_import* cfg, uint32_t space_id, trx_t* trx) + : + AbstractCallback(trx, space_id), + m_cfg(cfg), + m_index(cfg->m_indexes), + m_rec_iter(), + m_offsets_(), m_offsets(m_offsets_), + m_heap(0), + m_cluster_index(dict_table_get_first_index(cfg->m_table)) + { + rec_offs_init(m_offsets_); + } + + ~PageConverter() UNIV_NOTHROW override + { + if (m_heap != 0) { + mem_heap_free(m_heap); + } + } + + dberr_t run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW override + { + return fil_iterate(iter, block, *this); + } + + /** Called for each block as it is read from the file. + @param block block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override; + +private: + /** Update the page, set the space id, max trx id and index id. + @param block block read from file + @param page_type type of the page + @retval DB_SUCCESS or error code */ + dberr_t update_page(buf_block_t* block, uint16_t& page_type) + UNIV_NOTHROW; + + /** Update the space, index id, trx id. + @param block block to convert + @return DB_SUCCESS or error code */ + dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW; + + /** Update the BLOB refrences and write UNDO log entries for + rows that can't be purged optimistically. + @param block block to update + @retval DB_SUCCESS or error code */ + dberr_t update_records(buf_block_t* block) UNIV_NOTHROW; + + /** Validate the space flags and update tablespace header page. + @param block block read from file, not from the buffer pool. + @retval DB_SUCCESS or error code */ + dberr_t update_header(buf_block_t* block) UNIV_NOTHROW; + + /** Adjust the BLOB reference for a single column that is externally stored + @param rec record to update + @param offsets column offsets for the record + @param i column ordinal value + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_column( + rec_t* rec, + const rec_offs* offsets, + ulint i) UNIV_NOTHROW; + + /** Adjusts the BLOB reference in the clustered index row for all + externally stored columns. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_columns( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** In the clustered index, adjist the BLOB pointers as needed. + Also update the BLOB reference, write the new space id. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_ref( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** Purge delete-marked records, only if it is possible to do + so without re-organising the B+tree. + @retval true if purged */ + bool purge() UNIV_NOTHROW; + + /** Adjust the BLOB references and sys fields for the current record. + @param rec record to update + @param offsets column offsets for the record + @return DB_SUCCESS or error code. */ + dberr_t adjust_cluster_record( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW; + + /** Find an index with the matching id. 
+ @return row_index_t* instance or 0 */ + row_index_t* find_index(index_id_t id) UNIV_NOTHROW + { + row_index_t* index = &m_cfg->m_indexes[0]; + + for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) { + if (id == index->m_id) { + return(index); + } + } + + return(0); + + } +private: + /** Config for table that is being imported. */ + row_import* m_cfg; + + /** Current index whose pages are being imported */ + row_index_t* m_index; + + /** Iterator over records in a block */ + RecIterator m_rec_iter; + + /** Record offset */ + rec_offs m_offsets_[REC_OFFS_NORMAL_SIZE]; + + /** Pointer to m_offsets_ */ + rec_offs* m_offsets; + + /** Memory heap for the record offsets */ + mem_heap_t* m_heap; + + /** Cluster index instance */ + dict_index_t* m_cluster_index; +}; + +/** +row_import destructor. */ +row_import::~row_import() UNIV_NOTHROW +{ + for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) { + UT_DELETE_ARRAY(m_indexes[i].m_name); + + if (m_indexes[i].m_fields == NULL) { + continue; + } + + dict_field_t* fields = m_indexes[i].m_fields; + ulint n_fields = m_indexes[i].m_n_fields; + + for (ulint j = 0; j < n_fields; ++j) { + UT_DELETE_ARRAY(const_cast<char*>(fields[j].name())); + } + + UT_DELETE_ARRAY(fields); + } + + for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) { + UT_DELETE_ARRAY(m_col_names[i]); + } + + UT_DELETE_ARRAY(m_cols); + UT_DELETE_ARRAY(m_indexes); + UT_DELETE_ARRAY(m_col_names); + UT_DELETE_ARRAY(m_table_name); + UT_DELETE_ARRAY(m_hostname); +} + +/** Find the index entry in in the indexes array. +@param name index name +@return instance if found else 0. */ +row_index_t* +row_import::get_index( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_indexes; ++i) { + const char* index_name; + row_index_t* index = &m_indexes[i]; + + index_name = reinterpret_cast<const char*>(index->m_name); + + if (strcmp(index_name, name) == 0) { + + return(index); + } + } + + return(0); +} + +/** Get the number of rows in the index. +@param name index name +@return number of rows (doesn't include delete marked rows). */ +ulint +row_import::get_n_rows( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_rows); +} + +/** Get the number of rows for which purge failed uding the convert phase. +@param name index name +@return number of rows for which purge failed. */ +ulint +row_import::get_n_purge_failed( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_purge_failed); +} + +/** Find the ordinal value of the column name in the cfg table columns. +@param name of column to look for. +@return ULINT_UNDEFINED if not found. */ +ulint +row_import::find_col( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_cols; ++i) { + const char* col_name; + + col_name = reinterpret_cast<const char*>(m_col_names[i]); + + if (strcmp(col_name, name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** +Check if the index schema that was read from the .cfg file matches the +in memory index definition. +@return DB_SUCCESS or error code. 
*/ +dberr_t +row_import::match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW +{ + row_index_t* cfg_index; + dberr_t err = DB_SUCCESS; + + cfg_index = get_index(index->name); + + if (cfg_index == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s not found in tablespace meta-data file.", + index->name()); + + return(DB_ERROR); + } + + if (cfg_index->m_n_fields != index->n_fields) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index field count %u doesn't match" + " tablespace metadata file value " ULINTPF, + index->n_fields, cfg_index->m_n_fields); + + return(DB_ERROR); + } + + cfg_index->m_srv_index = index; + + const dict_field_t* field = index->fields; + const dict_field_t* cfg_field = cfg_index->m_fields; + + for (ulint i = 0; i < index->n_fields; ++i, ++field, ++cfg_field) { + + if (field->name() && cfg_field->name() + && strcmp(field->name(), cfg_field->name()) != 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index field name %s doesn't match" + " tablespace metadata field name %s" + " for field position " ULINTPF, + field->name(), cfg_field->name(), i); + + err = DB_ERROR; + } + + if (cfg_field->prefix_len != field->prefix_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s prefix len %u" + " doesn't match metadata file value %u", + index->name(), field->name(), + field->prefix_len, cfg_field->prefix_len); + + err = DB_ERROR; + } + + if (cfg_field->fixed_len != field->fixed_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s fixed len %u" + " doesn't match metadata file value %u", + index->name(), field->name(), + field->fixed_len, + cfg_field->fixed_len); + + err = DB_ERROR; + } + } + + return(err); +} + +/** Check if the table schema that was read from the .cfg file matches the +in memory table definition. +@param thd MySQL session variable +@return DB_SUCCESS or error code. 
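+
+Every column of the server table must appear in the .cfg meta-data at the
+same ordinal position and with identical type information (prtype, mtype,
+length, multi-byte lengths, ordering and prefix attributes); any difference
+is reported through ib_errf() and makes the import fail with a schema
+mismatch error.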
*/ +dberr_t +row_import::match_table_columns( + THD* thd) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + const dict_col_t* col = m_table->cols; + + for (ulint i = 0; i < m_table->n_cols; ++i, ++col) { + + const char* col_name; + ulint cfg_col_index; + + col_name = dict_table_get_col_name( + m_table, dict_col_get_no(col)); + + cfg_col_index = find_col(col_name); + + if (cfg_col_index == ULINT_UNDEFINED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s not found in tablespace.", + col_name); + + err = DB_ERROR; + } else if (cfg_col_index != col->ind) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordinal value mismatch, it's at %u" + " in the table and " ULINTPF + " in the tablespace meta-data file", + col_name, col->ind, cfg_col_index); + + err = DB_ERROR; + } else { + const dict_col_t* cfg_col; + + cfg_col = &m_cols[cfg_col_index]; + ut_a(cfg_col->ind == cfg_col_index); + + if (cfg_col->prtype != col->prtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s precise type mismatch," + " it's 0X%X in the table and 0X%X" + " in the tablespace meta file", + col_name, col->prtype, cfg_col->prtype); + err = DB_ERROR; + } + + if (cfg_col->mtype != col->mtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s main type mismatch," + " it's 0X%X in the table and 0X%X" + " in the tablespace meta file", + col_name, col->mtype, cfg_col->mtype); + err = DB_ERROR; + } + + if (cfg_col->len != col->len) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s length mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->len, cfg_col->len); + err = DB_ERROR; + } + + if (cfg_col->mbminlen != col->mbminlen + || cfg_col->mbmaxlen != col->mbmaxlen) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s multi-byte len mismatch," + " it's %u-%u in the table and %u-%u" + " in the tablespace meta file", + col_name, col->mbminlen, col->mbmaxlen, + cfg_col->mbminlen, cfg_col->mbmaxlen); + err = DB_ERROR; + } + + if (cfg_col->ind != col->ind) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s position mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->ind, cfg_col->ind); + err = DB_ERROR; + } + + if (cfg_col->ord_part != col->ord_part) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordering mismatch," + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->ord_part, + cfg_col->ord_part); + err = DB_ERROR; + } + + if (cfg_col->max_prefix != col->max_prefix) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s max prefix mismatch" + " it's %u in the table and %u" + " in the tablespace meta file", + col_name, col->max_prefix, + cfg_col->max_prefix); + err = DB_ERROR; + } + } + } + + return(err); +} + +dberr_t row_import::match_flags(THD *thd) const +{ + ulint mismatch= (m_table->flags ^ m_flags) & ~DICT_TF_MASK_DATA_DIR; + if (!mismatch) + return DB_SUCCESS; + + const char *msg; + if (mismatch & DICT_TF_MASK_ZIP_SSIZE) + { + if ((m_table->flags & DICT_TF_MASK_ZIP_SSIZE) && + (m_flags & DICT_TF_MASK_ZIP_SSIZE)) + { + switch (m_flags & DICT_TF_MASK_ZIP_SSIZE) { + case 0U << DICT_TF_POS_ZIP_SSIZE: + goto uncompressed; + case 1U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1"; + break; + case 2U << DICT_TF_POS_ZIP_SSIZE: + msg= 
"ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2"; + break; + case 3U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4"; + break; + case 4U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8"; + break; + case 5U << DICT_TF_POS_ZIP_SSIZE: + msg= "ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=16"; + break; + default: + msg= "strange KEY_BLOCK_SIZE"; + } + } + else if (m_flags & DICT_TF_MASK_ZIP_SSIZE) + msg= "ROW_FORMAT=COMPRESSED"; + else + goto uncompressed; + } + else + { + uncompressed: + msg= (m_flags & DICT_TF_MASK_ATOMIC_BLOBS) ? "ROW_FORMAT=DYNAMIC" + : (m_flags & DICT_TF_MASK_COMPACT) ? "ROW_FORMAT=COMPACT" + : "ROW_FORMAT=REDUNDANT"; + } + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Table flags don't match, server table has 0x%x and the meta-data " + "file has 0x%zx; .cfg file uses %s", + m_table->flags, m_flags, msg); + + return DB_ERROR; +} + +/** Check if the table (and index) schema that was read from the .cfg file +matches the in memory table definition. +@param thd MySQL session variable +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_schema( + THD* thd) UNIV_NOTHROW +{ + /* Do some simple checks. */ + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + /* If the number of indexes don't match then it is better + to abort the IMPORT. It is easy for the user to create a + table matching the IMPORT definition. */ + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of indexes don't match, table has " ULINTPF + " indexes but the tablespace meta-data file has " + ULINTPF " indexes", + UT_LIST_GET_LEN(m_table->indexes), m_n_indexes); + + return(DB_ERROR); + } + + dberr_t err = match_table_columns(thd); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Check if the index definitions match. */ + + const dict_index_t* index; + + for (index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + dberr_t index_err; + + index_err = match_index_columns(thd, index); + + if (index_err != DB_SUCCESS) { + err = index_err; + } + } + + return(err); +} + +/** +Set the index root <space, pageno>, using index name. */ +void +row_import::set_root_by_name() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) { + dict_index_t* index; + + const char* index_name; + + index_name = reinterpret_cast<const char*>(cfg_index->m_name); + + index = dict_table_get_index_on_name(m_table, index_name); + + /* We've already checked that it exists. */ + ut_a(index != 0); + + index->page = cfg_index->m_page_no; + } +} + +/** +Set the index root <space, pageno>, using a heuristic. 
+@return DB_SUCCESS or error code */ +dberr_t +row_import::set_root_by_heuristic() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + ut_a(m_n_indexes > 0); + + // TODO: For now use brute force, based on ordinality + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + ib::warn() << "Table " << m_table->name << " should have " + << UT_LIST_GET_LEN(m_table->indexes) << " indexes but" + " the tablespace has " << m_n_indexes << " indexes"; + } + + ulint i = 0; + dberr_t err = DB_SUCCESS; + + for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + index->type |= DICT_CORRUPT; + ib::warn() << "Skipping FTS index: " << index->name; + } else if (i < m_n_indexes) { + + UT_DELETE_ARRAY(cfg_index[i].m_name); + + ulint len = strlen(index->name) + 1; + + cfg_index[i].m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_14", + UT_DELETE_ARRAY(cfg_index[i].m_name); + cfg_index[i].m_name = NULL; + ); + + if (cfg_index[i].m_name == NULL) { + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(cfg_index[i].m_name, index->name, len); + + cfg_index[i].m_srv_index = index; + + index->page = cfg_index[i++].m_page_no; + } + } + + return(err); +} + +/** +Purge delete marked records. +@return DB_SUCCESS or error code. */ +dberr_t +IndexPurge::garbage_collect() UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_index->table); + + /* Open the persistent cursor and start the mini-transaction. */ + + dberr_t err = open() ? next() : DB_CORRUPTION; + + for (; err == DB_SUCCESS; err = next()) { + + rec_t* rec = btr_pcur_get_rec(&m_pcur); + ibool deleted = rec_get_deleted_flag(rec, comp); + + if (!deleted) { + ++m_n_rows; + } else { + err = purge(); + if (err != DB_SUCCESS) { + break; + } + } + } + + /* Close the persistent cursor and commit the mini-transaction. */ + + close(); + + return(err == DB_END_OF_INDEX ? DB_SUCCESS : err); +} + +/** +Begin import, position the cursor on the first record. */ +inline bool IndexPurge::open() noexcept +{ + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + + btr_pcur_init(&m_pcur); + + if (m_pcur.open_leaf(true, m_index, BTR_MODIFY_LEAF, &m_mtr) != DB_SUCCESS) + return false; + + rec_t *rec= page_rec_get_next(btr_pcur_get_rec(&m_pcur)); + if (!rec) + return false; + if (rec_is_metadata(rec, *m_index)) + /* Skip the metadata pseudo-record. */ + btr_pcur_get_page_cur(&m_pcur)->rec= rec; + return true; +} + +/** +Position the cursor on the next record. +@return DB_SUCCESS or error code */ +dberr_t IndexPurge::next() noexcept +{ + if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(&m_pcur))) { + return DB_CORRUPTION; + } + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (!btr_pcur_is_after_last_on_page(&m_pcur)) { + return(DB_SUCCESS); + } else if (trx_is_interrupted(m_trx)) { + /* Check after every page because the check + is expensive. */ + return(DB_INTERRUPTED); + } + + btr_pcur_store_position(&m_pcur, &m_mtr); + + mtr_commit(&m_mtr); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + if (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) + == btr_pcur_t::CORRUPTED) { + return DB_CORRUPTION; + } + /* The following is based on btr_pcur_move_to_next_user_rec(). 
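+	After the cursor position has been restored on the new page, the
+	loop below advances until the cursor rests on a user record again:
+	if it is past the last record on a page it moves on to the next
+	page, and DB_END_OF_INDEX is returned once the end of the tree
+	has been reached.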
*/ + m_pcur.old_rec = nullptr; + ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF); + do { + if (btr_pcur_is_after_last_on_page(&m_pcur)) { + if (btr_pcur_is_after_last_in_tree(&m_pcur)) { + return DB_END_OF_INDEX; + } + + if (dberr_t err = btr_pcur_move_to_next_page(&m_pcur, + &m_mtr)) { + return err; + } + } else if (!btr_pcur_move_to_next_on_page(&m_pcur)) { + return DB_CORRUPTION; + } + } while (!btr_pcur_is_on_user_rec(&m_pcur)); + + return DB_SUCCESS; +} + +/** +Store the persistent cursor position and reopen the +B-tree cursor in BTR_MODIFY_TREE mode, because the +tree structure may be changed during a pessimistic delete. */ +inline dberr_t IndexPurge::purge_pessimistic_delete() noexcept +{ + dberr_t err; + if (m_pcur.restore_position(BTR_PURGE_TREE, &m_mtr) != btr_pcur_t::CORRUPTED) + { + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(&m_pcur), + m_index->table->not_redundant())); + btr_cur_pessimistic_delete(&err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, + false, &m_mtr); + } + else + err= DB_CORRUPTION; + + m_mtr.commit(); + return err; +} + +dberr_t IndexPurge::purge() noexcept +{ + btr_pcur_store_position(&m_pcur, &m_mtr); + m_mtr.commit(); + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + dberr_t err= purge_pessimistic_delete(); + + m_mtr.start(); + m_mtr.set_log_mode(MTR_LOG_NO_REDO); + if (err == DB_SUCCESS) + err= (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) == + btr_pcur_t::CORRUPTED) + ? DB_CORRUPTION : DB_SUCCESS; + return err; +} + +/** Adjust the BLOB reference for a single column that is externally stored +@param rec record to update +@param offsets column offsets for the record +@param i column ordinal value +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_column( + rec_t* rec, + const rec_offs* offsets, + ulint i) UNIV_NOTHROW +{ + ulint len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &len); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_2", + len = BTR_EXTERN_FIELD_REF_SIZE - 1;); + + if (len < BTR_EXTERN_FIELD_REF_SIZE) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Externally stored column(" ULINTPF + ") has a reference length of " ULINTPF + " in the cluster index %s", + i, len, m_cluster_index->name()); + + return(DB_CORRUPTION); + } + + field += len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_SPACE_ID); + + mach_write_to_4(field, get_space_id()); + + if (UNIV_LIKELY_NULL(m_rec_iter.current_block()->page.zip.data)) { + page_zip_write_blob_ptr( + m_rec_iter.current_block(), rec, m_cluster_index, + offsets, i, &m_rec_iter.m_mtr); + } + + return(DB_SUCCESS); +} + +/** Adjusts the BLOB reference in the clustered index row for all externally +stored columns. +@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_columns( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + ut_ad(rec_offs_any_extern(offsets)); + + /* Adjust the space_id in the BLOB pointers. */ + + for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) { + + /* Only if the column is stored "externally". */ + + if (rec_offs_nth_extern(offsets, i)) { + dberr_t err; + + err = adjust_cluster_index_blob_column(rec, offsets, i); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** In the clustered index, adjust BLOB pointers as needed. Also update the +BLOB reference, write the new space id. 
+@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::adjust_cluster_index_blob_ref( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + if (rec_offs_any_extern(offsets)) { + dberr_t err; + + err = adjust_cluster_index_blob_columns(rec, offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + return(DB_SUCCESS); +} + +/** Purge delete-marked records, only if it is possible to do so without +re-organising the B+tree. +@return true if purge succeeded */ +inline bool PageConverter::purge() UNIV_NOTHROW +{ + /* We can't have a page that is empty and not root. */ + if (m_rec_iter.remove(m_offsets)) { + + ++m_index->m_stats.m_n_purged; + + return(true); + } else { + ++m_index->m_stats.m_n_purge_failed; + } + + return(false); +} + +/** Adjust the BLOB references and sys fields for the current record. +@param rec record to update +@param offsets column offsets for the record +@return DB_SUCCESS or error code. */ +inline +dberr_t +PageConverter::adjust_cluster_record( + rec_t* rec, + const rec_offs* offsets) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) { + + /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields + are only written in conjunction with other changes to the + record. */ + ulint trx_id_pos = m_cluster_index->n_uniq + ? m_cluster_index->n_uniq : 1; + if (UNIV_LIKELY_NULL(m_rec_iter.current_block() + ->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + m_rec_iter.current_block(), + rec, m_offsets, trx_id_pos, + 0, roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS, + &m_rec_iter.m_mtr); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, m_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memcpy(ptr, reset_trx_id, sizeof reset_trx_id); + } + } + + return(err); +} + +/** Update the BLOB refrences and write UNDO log entries for +rows that can't be purged optimistically. +@param block block to update +@retval DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_records( + buf_block_t* block) UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_cfg->m_table); + bool clust_index = m_index->m_srv_index == m_cluster_index; + + /* This will also position the cursor on the first user record. */ + + if (!m_rec_iter.open(block, m_index->m_srv_index)) { + return DB_CORRUPTION; + } + + while (!m_rec_iter.end()) { + rec_t* rec = m_rec_iter.current(); + ibool deleted = rec_get_deleted_flag(rec, comp); + + /* For the clustered index we have to adjust the BLOB + reference and the system fields irrespective of the + delete marked flag. The adjustment of delete marked + cluster records is required for purge to work later. */ + + if (deleted || clust_index) { + m_offsets = rec_get_offsets( + rec, m_index->m_srv_index, m_offsets, + m_index->m_srv_index->n_core_fields, + ULINT_UNDEFINED, &m_heap); + } + + if (clust_index) { + + dberr_t err = adjust_cluster_record(rec, m_offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* If it is a delete marked record then try an + optimistic delete. */ + + if (deleted) { + ++m_index->m_stats.m_n_deleted; + /* A successful purge will move the cursor to the + next record. */ + + if (purge()) { + continue; + } + } else { + ++m_index->m_stats.m_n_rows; + } + + if (!m_rec_iter.next()) { + return DB_CORRUPTION; + } + } + + return(DB_SUCCESS); +} + +/** Update the space, index id, trx id. 
+@return DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_index_page( + buf_block_t* block) UNIV_NOTHROW +{ + const page_id_t page_id(block->page.id()); + + if (is_free(page_id.page_no())) { + return(DB_SUCCESS); + } + + buf_frame_t* page = block->page.frame; + const index_id_t id = btr_page_get_index_id(page); + + if (id != m_index->m_id) { + row_index_t* index = find_index(id); + + if (UNIV_UNLIKELY(!index)) { + if (!m_cfg->m_missing) { + ib::warn() << "Unknown index id " << id + << " on page " << page_id.page_no(); + } + return DB_SUCCESS; + } + + m_index = index; + } + + /* If the .cfg file is missing and there is an index mismatch + then ignore the error. */ + if (m_cfg->m_missing && !m_index->m_srv_index) { + return(DB_SUCCESS); + } + + if (m_index && page_id.page_no() == m_index->m_page_no) { + byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE + + page; + mach_write_to_4(b, page_id.space()); + + memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE + + page, b, 4); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy(&block->page.zip.data[FIL_PAGE_DATA + + PAGE_BTR_SEG_TOP + + FSEG_HDR_SPACE], b, 4); + memcpy(&block->page.zip.data[FIL_PAGE_DATA + + PAGE_BTR_SEG_LEAF + + FSEG_HDR_SPACE], b, 4); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!block->page.zip.data || page_zip_validate(&block->page.zip, page, + m_index->m_srv_index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* This has to be written to uncompressed index header. Set it to + the current index id. */ + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), + m_index->m_srv_index->id); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID], + &block->page.frame[PAGE_HEADER + PAGE_INDEX_ID], 8); + } + + if (m_index->m_srv_index->is_clust()) { + if (page_id.page_no() != m_index->m_srv_index->page) { + goto clear_page_max_trx_id; + } + } else if (page_is_leaf(page)) { + /* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */ + mach_write_to_8(&block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], m_trx->id); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memcpy_aligned<8>(&block->page.zip.data + [PAGE_HEADER + PAGE_MAX_TRX_ID], + &block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], 8); + } + } else { +clear_page_max_trx_id: + /* Clear PAGE_MAX_TRX_ID so that it can be + used for other purposes in the future. IMPORT + in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1 + would set the field to the transaction ID even + on clustered index pages. */ + memset_aligned<8>(&block->page.frame + [PAGE_HEADER + PAGE_MAX_TRX_ID], + 0, 8); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + memset_aligned<8>(&block->page.zip.data + [PAGE_HEADER + PAGE_MAX_TRX_ID], + 0, 8); + } + } + + if (page_is_empty(page)) { + + /* Only a root page can be empty. */ + if (page_has_siblings(page)) { + // TODO: We should relax this and skip secondary + // indexes. Mark them as corrupt because they can + // always be rebuilt. + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); + } + + return page_is_leaf(block->page.frame) + ? update_records(block) + : DB_SUCCESS; +} + +/** Validate the space flags and update tablespace header page. +@param block block read from file, not from the buffer pool. 
+@retval DB_SUCCESS or error code */ +inline dberr_t PageConverter::update_header(buf_block_t* block) UNIV_NOTHROW +{ + byte *frame= get_frame(block); + if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + frame, + FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, 4)) + ib::warn() << "Space id check in the header failed: ignored"; + else if (!mach_read_from_4(FIL_PAGE_SPACE_ID + frame)) + return DB_CORRUPTION; + + memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + + /* Write space_id to the tablespace header, page 0. */ + mach_write_to_4(FIL_PAGE_SPACE_ID + frame, get_space_id()); + memcpy_aligned<2>(FSP_HEADER_OFFSET + FSP_SPACE_ID + frame, + FIL_PAGE_SPACE_ID + frame, 4); + /* Write back the adjusted flags. */ + mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + frame, m_space_flags); + + return DB_SUCCESS; +} + +/** Update the page, set the space id, max trx id and index id. +@param block block read from file +@retval DB_SUCCESS or error code */ +inline +dberr_t +PageConverter::update_page(buf_block_t* block, uint16_t& page_type) + UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + + ut_ad(!block->page.zip.data == !is_compressed_table()); + + switch (page_type = fil_page_get_type(get_frame(block))) { + case FIL_PAGE_TYPE_FSP_HDR: + ut_a(block->page.id().page_no() == 0); + /* Work directly on the uncompressed page headers. */ + return(update_header(block)); + + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + /* We need to decompress the contents + before we can do anything. */ + + if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) { + return(DB_CORRUPTION); + } + + /* fall through */ + case FIL_PAGE_TYPE_INSTANT: + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + /* Only update the Btree nodes. */ + return(update_index_page(block)); + + case FIL_PAGE_TYPE_SYS: + /* This is page 0 in the system tablespace. */ + return(DB_CORRUPTION); + + case FIL_PAGE_TYPE_XDES: + err = set_current_xdes( + block->page.id().page_no(), get_frame(block)); + /* fall through */ + case FIL_PAGE_INODE: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + + /* Work directly on the uncompressed page headers. */ + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + return(err); + } + + ib::warn() << "Unknown page type (" << page_type << ")"; + + return(DB_CORRUPTION); +} + +/** Called for every page in the tablespace. If the page was not +updated then its state must be set to BUF_PAGE_NOT_USED. +@param block block read from file, note it is not from the buffer pool +@retval DB_SUCCESS or error code. */ +dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW +{ + /* If we already had an old page with matching number + in the buffer pool, evict it now, because + we no longer evict the pages on DISCARD TABLESPACE. 
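+	The lookup below uses BUF_PEEK_IF_IN_POOL with RW_NO_LATCH, so it
+	only affects pages that are already resident in the buffer pool;
+	pages that are not cached are not read in from disk.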
*/ + buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH, + nullptr, BUF_PEEK_IF_IN_POOL, + nullptr, nullptr, false); + + uint16_t page_type; + + if (dberr_t err = update_page(block, page_type)) { + return err; + } + + const bool full_crc32 = fil_space_t::full_crc32(get_space_flags()); + byte* frame = get_frame(block); + memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8); + + if (!block->page.zip.data) { + buf_flush_init_for_writing( + NULL, block->page.frame, NULL, full_crc32); + } else if (fil_page_type_is_index(page_type)) { + buf_flush_init_for_writing( + NULL, block->page.zip.data, &block->page.zip, + full_crc32); + } else { + /* Calculate and update the checksum of non-index + pages for ROW_FORMAT=COMPRESSED tables. */ + buf_flush_update_zip_checksum( + block->page.zip.data, block->zip_size()); + } + + return DB_SUCCESS; +} + +/*****************************************************************//** +Clean up after import tablespace. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_cleanup( +/*===============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + dberr_t err) /*!< in: error code */ +{ + if (err != DB_SUCCESS) { + dict_table_t* table = prebuilt->table; + table->file_unreadable = true; + if (table->space) { + fil_close_tablespace(table->space_id); + table->space = NULL; + } + + prebuilt->trx->error_info = NULL; + + ib::info() << "Discarding tablespace of table " + << table->name << ": " << err; + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; + } + } + + DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE();); + + prebuilt->trx->commit(); + + if (prebuilt->trx->dict_operation_lock_mode) { + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + + prebuilt->trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE();); + + return(err); +} + +/*****************************************************************//** +Report error during tablespace import. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_error( +/*=============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + dberr_t err) /*!< in: error code */ +{ + if (!trx_is_interrupted(prebuilt->trx)) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name.m_name); + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_INNODB_IMPORT_ERROR, + table_name, (ulong) err, ut_strerr(err)); + } + + return row_import_cleanup(prebuilt, err); +} + +/*****************************************************************//** +Adjust the root page index node and leaf node segment headers, update +with the new space id. For all the table's secondary indexes. +@return error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_adjust_root_pages_of_secondary_indexes( +/*==============================================*/ + trx_t* trx, /*!< in: transaction used for + the import */ + dict_table_t* table, /*!< in: table the indexes + belong to */ + const row_import& cfg) /*!< Import context */ +{ + dict_index_t* index; + ulint n_rows_in_table; + dberr_t err = DB_SUCCESS; + + /* Skip the clustered index. 
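+	Only its row count is needed here: after the purge pass each
+	secondary index is expected to contain exactly as many entries as
+	the clustered index, otherwise it is flagged as corrupt.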
*/ + index = dict_table_get_first_index(table); + + n_rows_in_table = cfg.get_n_rows(index->name); + + DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure", + n_rows_in_table++;); + + /* Adjust the root pages of the secondary indexes only. */ + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_a(!dict_index_is_clust(index)); + + if (!(index->type & DICT_CORRUPT) + && index->page != FIL_NULL) { + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + } else { + ib::warn() << "Skip adjustment of root pages for" + " index " << index->name << "."; + + err = DB_CORRUPTION; + } + + if (err != DB_SUCCESS) { + + if (index->type & DICT_CLUSTERED) { + break; + } + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index %s not found or corrupt," + " you should recreate this index.", + index->name()); + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + index->type |= DICT_CORRUPT; + continue; + } + + /* If we failed to purge any records in the index then + do it the hard way. + + TODO: We can do this in the first pass by generating UNDO log + records for the failed rows. */ + + if (!cfg.requires_purge(index->name)) { + continue; + } + + IndexPurge purge(trx, index); + + trx->op_info = "secondary: purge delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + break; + } else if (purge.get_n_rows() != n_rows_in_table) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' contains " ULINTPF " entries, " + "should be " ULINTPF ", you should recreate " + "this index.", index->name(), + purge.get_n_rows(), n_rows_in_table); + + index->type |= DICT_CORRUPT; + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + } + } + + return(err); +} + +/*****************************************************************//** +Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */ +MY_ATTRIBUTE((nonnull)) static +void +row_import_set_sys_max_row_id( +/*==========================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + const dict_table_t* table) /*!< in: table to import */ +{ + const rec_t* rec; + mtr_t mtr; + btr_pcur_t pcur; + row_id_t row_id = 0; + dict_index_t* index; + + index = dict_table_get_first_index(table); + ut_ad(index->is_primary()); + ut_ad(dict_index_is_auto_gen_clust(index)); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr) + == DB_SUCCESS) { + rec = btr_pcur_move_to_prev_on_page(&pcur); + + if (!rec) { + /* The table is corrupted. */ + } else if (page_rec_is_infimum(rec)) { + /* The table is empty. */ + } else if (rec_is_metadata(rec, *index)) { + /* The clustered index contains the metadata + record only, that is, the table is empty. */ + } else { + row_id = mach_read_from_6(rec); + } + } + + mtr_commit(&mtr); + + if (row_id) { + /* Update the system row id if the imported index row id is + greater than the max system row id. */ + dict_sys.update_row_id(row_id); + } +} + +/*****************************************************************//** +Read the a string from the meta data file. +@return DB_SUCCESS or error code. 
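+
+The string is stored NUL-terminated in the file and max_len includes the
+terminating NUL byte; the read only succeeds when the NUL is found after
+exactly max_len - 1 preceding bytes, anything shorter or longer is treated
+as DB_IO_ERROR.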
*/ +static +dberr_t +row_import_cfg_read_string( +/*=======================*/ + FILE* file, /*!< in/out: File to read from */ + byte* ptr, /*!< out: string to read */ + ulint max_len) /*!< in: maximum length of the output + buffer in bytes */ +{ + DBUG_EXECUTE_IF("ib_import_string_read_error", + errno = EINVAL; return(DB_IO_ERROR);); + + ulint len = 0; + + while (!feof(file)) { + int ch = fgetc(file); + + if (ch == EOF) { + break; + } else if (ch != 0) { + if (len < max_len) { + ptr[len++] = static_cast<byte>(ch); + } else { + break; + } + /* max_len includes the NUL byte */ + } else if (len != max_len - 1) { + break; + } else { + ptr[len] = 0; + return(DB_SUCCESS); + } + } + + errno = EINVAL; + + return(DB_IO_ERROR); +} + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_cfg_read_index_fields( +/*=============================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_index_t* index) /*!< Index being read in */ +{ + byte row[sizeof(ib_uint32_t) * 3]; + ulint n_fields = index->m_n_fields; + + index->m_fields = UT_NEW_ARRAY_NOKEY(dict_field_t, n_fields); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_4", + UT_DELETE_ARRAY(index->m_fields); + index->m_fields = NULL; + ); + + if (index->m_fields == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dict_field_t* field = index->m_fields; + + for (ulint i = 0; i < n_fields; ++i, ++field) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_1", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading index fields."); + + return(DB_IO_ERROR); + } + + new (field) dict_field_t(); + + field->prefix_len = mach_read_from_4(ptr) & ((1U << 12) - 1); + ptr += sizeof(ib_uint32_t); + + field->fixed_len = mach_read_from_4(ptr) & ((1U << 10) - 1); + ptr += sizeof(ib_uint32_t); + + /* Include the NUL byte in the length. */ + ulint len = mach_read_from_4(ptr); + + byte* name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_5", + UT_DELETE_ARRAY(name); + name = NULL; + ); + + if (name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + field->name = reinterpret_cast<const char*>(name); + + dberr_t err = row_import_cfg_read_string(file, name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the index names and root page numbers of the indexes and set the values. +Row format [root_page_no, len of str, str ... ] +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_index_data( +/*=======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte* ptr; + row_index_t* cfg_index; + byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9]; + + /* FIXME: What is the max value? 
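+	There is no documented upper bound for the number of indexes in
+	the .cfg format; row_import_read_indexes() rejects counts above
+	1024 before calling this function, and the assertions below merely
+	repeat that sanity check.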
*/ + ut_a(cfg->m_n_indexes > 0); + ut_a(cfg->m_n_indexes < 1024); + + cfg->m_indexes = UT_NEW_ARRAY_NOKEY(row_index_t, cfg->m_n_indexes); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_6", + UT_DELETE_ARRAY(cfg->m_indexes); + cfg->m_indexes = NULL; + ); + + if (cfg->m_indexes == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + cfg_index = cfg->m_indexes; + + for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) { + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_2", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the index data. */ + size_t n_bytes = fread(row, 1, sizeof(row), file); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error", + (void) fseek(file, 0L, SEEK_END);); + + if (n_bytes != sizeof(row)) { + char msg[BUFSIZ]; + + snprintf(msg, sizeof(msg), + "while reading index meta-data, expected " + "to read " ULINTPF + " bytes but read only " ULINTPF " bytes", + sizeof(row), n_bytes); + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), msg); + + ib::error() << "IO Error: " << msg; + + return(DB_IO_ERROR); + } + + ptr = row; + + cfg_index->m_id = mach_read_from_8(ptr); + ptr += sizeof(index_id_t); + + cfg_index->m_space = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_page_no = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_type = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_trx_id_offset = mach_read_from_4(ptr); + if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) { + ut_ad(0); + /* Overflow. Pretend that the clustered index + has a variable-length PRIMARY KEY. */ + cfg_index->m_trx_id_offset = 0; + } + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_uniq = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_nullable = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_fields = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* The NUL byte is included in the name length. */ + ulint len = mach_read_from_4(ptr); + + if (len > OS_FILE_MAX_PATH) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Index name length (" ULINTPF ") is too long, " + "the meta-data is corrupt", len); + + return(DB_CORRUPTION); + } + + cfg_index->m_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_7", + UT_DELETE_ARRAY(cfg_index->m_name); + cfg_index->m_name = NULL; + ); + + if (cfg_index->m_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string(file, cfg_index->m_name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing index name."); + + return(err); + } + + err = row_import_cfg_read_index_fields(file, thd, cfg_index); + + if (err != DB_SUCCESS) { + return(err); + } + + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the index root page number for v1 format. +@return DB_SUCCESS or error code. 
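+The function below reads the 32-bit index count from the .cfg stream,
+validates it, and then delegates to row_import_read_index_data() to read
+the per-index meta-data records.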
*/ +static +dberr_t +row_import_read_indexes( +/*====================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_3", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the number of indexes. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading number of indexes."); + + return(DB_IO_ERROR); + } + + cfg->m_n_indexes = mach_read_from_4(row); + + if (cfg->m_n_indexes == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is 0"); + + return(DB_CORRUPTION); + + } else if (cfg->m_n_indexes > 1024) { + // FIXME: What is the upper limit? */ + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is too high: " + ULINTPF, cfg->m_n_indexes); + cfg->m_n_indexes = 0; + + return(DB_CORRUPTION); + } + + return(row_import_read_index_data(file, thd, cfg)); +} + +/*********************************************************************//** +Read the meta data (table columns) config file. Deserialise the contents of +dict_col_t structure, along with the column name. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_columns( +/*====================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 8]; + + /* FIXME: What should the upper limit be? */ + ut_a(cfg->m_n_cols > 0); + ut_a(cfg->m_n_cols < 1024); + + cfg->m_cols = UT_NEW_ARRAY_NOKEY(dict_col_t, cfg->m_n_cols); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_8", + UT_DELETE_ARRAY(cfg->m_cols); + cfg->m_cols = NULL; + ); + + if (cfg->m_cols == NULL) { + return(DB_OUT_OF_MEMORY); + } + + cfg->m_col_names = UT_NEW_ARRAY_NOKEY(byte*, cfg->m_n_cols); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_9", + UT_DELETE_ARRAY(cfg->m_col_names); + cfg->m_col_names = NULL; + ); + + if (cfg->m_col_names == NULL) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_cols, 0x0, sizeof(cfg->m_cols) * cfg->m_n_cols); + memset(cfg->m_col_names, 0x0, sizeof(cfg->m_col_names) * cfg->m_n_cols); + + col = cfg->m_cols; + + for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_4", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading table column meta-data."); + + return(DB_IO_ERROR); + } + + col->prtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mtype = static_cast<byte>(mach_read_from_4(ptr)); + ptr += sizeof(ib_uint32_t); + + col->len = static_cast<uint16_t>(mach_read_from_4(ptr)); + ptr += sizeof(ib_uint32_t); + + uint32_t mbminmaxlen = mach_read_from_4(ptr); + col->mbmaxlen = (mbminmaxlen / 5) & 7; + col->mbminlen = (mbminmaxlen % 5) & 7; + ptr += sizeof(ib_uint32_t); + + col->ind = mach_read_from_4(ptr) & dict_index_t::MAX_N_FIELDS; + ptr += sizeof(ib_uint32_t); + + col->ord_part = mach_read_from_4(ptr) & 1; + ptr += sizeof(ib_uint32_t); + + col->max_prefix = mach_read_from_4(ptr) & ((1U << 12) - 1); + ptr += sizeof(ib_uint32_t); + + /* Read in the column name as 
[len, byte array]. The len + includes the NUL byte. */ + + ulint len = mach_read_from_4(ptr); + + /* FIXME: What is the maximum column name length? */ + if (len == 0 || len > 128) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_IO_READ_ERROR, + "Column name length " ULINTPF ", is invalid", + len); + + return(DB_CORRUPTION); + } + + cfg->m_col_names[i] = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_10", + UT_DELETE_ARRAY(cfg->m_col_names[i]); + cfg->m_col_names[i] = NULL; + ); + + if (cfg->m_col_names[i] == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string( + file, cfg->m_col_names[i], len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table column name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_v1( +/*===============*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< out: meta data */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_5", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the hostname where the tablespace was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data export hostname length."); + + return(DB_IO_ERROR); + } + + ulint len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_hostname = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_1", + UT_DELETE_ARRAY(cfg->m_hostname); + cfg->m_hostname = NULL; + ); + + if (cfg->m_hostname == NULL) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing export hostname."); + + return(err); + } + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_6", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the table name of tablespace that was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data table name length."); + + return(DB_IO_ERROR); + } + + len = mach_read_from_4(value); + + /* NUL byte is part of name length. 
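+	As with the hostname above, the exporter wrote the length of the
+	table name including its terminating NUL byte, so a buffer of
+	exactly that size is allocated below.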
*/ + cfg->m_table_name = UT_NEW_ARRAY_NOKEY(byte, len); + + /* Trigger OOM */ + DBUG_EXECUTE_IF( + "ib_import_OOM_2", + UT_DELETE_ARRAY(cfg->m_table_name); + cfg->m_table_name = NULL; + ); + + if (cfg->m_table_name == NULL) { + return(DB_OUT_OF_MEMORY); + } + + err = row_import_cfg_read_string(file, cfg->m_table_name, len); + + if (err != DB_SUCCESS) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while parsing table name."); + + return(err); + } + + ib::info() << "Importing tablespace for table '" << cfg->m_table_name + << "' that was exported from host '" << cfg->m_hostname << "'"; + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_7", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the autoinc value. */ + if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading autoinc value."); + + return(DB_IO_ERROR); + } + + cfg->m_autoinc = mach_read_from_8(row); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_8", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the tablespace page size. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data header."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + const ulint logical_page_size = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + if (logical_page_size != srv_page_size) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Tablespace to be imported has a different" + " page size than this server. Server page size" + " is %lu, whereas tablespace page size" + " is " ULINTPF, + srv_page_size, + logical_page_size); + + return(DB_ERROR); + } + + cfg->m_flags = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg->m_zip_size = dict_tf_get_zip_size(cfg->m_flags); + cfg->m_n_cols = mach_read_from_4(ptr); + + if (!dict_tf_is_valid(cfg->m_flags)) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Invalid table flags: " ULINTPF, cfg->m_flags); + + return(DB_CORRUPTION); + } + + err = row_import_read_columns(file, thd, cfg); + + if (err == DB_SUCCESS) { + err = row_import_read_indexes(file, thd, cfg); + } + + return(err); +} + +/** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_meta_data( +/*======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_9", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(&row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), + "while reading meta-data version."); + + return(DB_IO_ERROR); + } + + cfg.m_version = mach_read_from_4(row); + + /* Check the version number. 
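+	Only IB_EXPORT_CFG_VERSION_V1 is supported; any other version
+	causes the file to be ignored and DB_ERROR to be returned.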
*/ + switch (cfg.m_version) { + case IB_EXPORT_CFG_VERSION_V1: + + return(row_import_read_v1(file, thd, &cfg)); + default: + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Unsupported meta-data version number (" ULINTPF "), " + "file ignored", cfg.m_version); + } + + return(DB_ERROR); +} + +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB part header, in bytes */ + +/* decrypt and decompress page if needed */ +static dberr_t decrypt_decompress(fil_space_crypt_t *space_crypt, + uint32_t space_flags, span<byte> page, + uint32_t space_id, byte *page_compress_buf) +{ + auto *data= page.data(); + + if (space_crypt && space_crypt->should_encrypt()) + { + if (!buf_page_verify_crypt_checksum(data, space_flags)) + return DB_CORRUPTION; + + if (dberr_t err= fil_space_decrypt(space_id, space_flags, space_crypt, + data, page.size(), data)) + return err; + } + + bool page_compressed= false; + + if (fil_space_t::full_crc32(space_flags) && + fil_space_t::is_compressed(space_flags)) + page_compressed= buf_page_is_compressed(data, space_flags); + else + { + switch (fil_page_get_type(data)) { + case FIL_PAGE_PAGE_COMPRESSED: + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + page_compressed= true; + } + } + + if (page_compressed) + { + auto compress_length= + fil_page_decompress(page_compress_buf, data, space_flags); + ut_ad(compress_length != srv_page_size); + + if (compress_length == 0) + return DB_CORRUPTION; + } + + return DB_SUCCESS; +} + +static size_t get_buf_size() +{ + return srv_page_size + ( + provider_service_lzo->is_loaded ? LZO1X_1_15_MEM_COMPRESS : + provider_service_snappy->is_loaded ? snappy_max_compressed_length(srv_page_size) : + 0 + ); +} + +/* find, parse instant metadata, performing variaous checks, +and apply it to dict_table_t +@return DB_SUCCESS or some error */ +static dberr_t handle_instant_metadata(dict_table_t *table, + const row_import &cfg) +{ + dict_get_and_save_data_dir_path(table); + + char *filepath; + if (DICT_TF_HAS_DATA_DIR(table->flags)) + { + ut_a(table->data_dir_path); + filepath= fil_make_filepath(table->data_dir_path, table->name, IBD, true); + } + else + filepath= fil_make_filepath(nullptr, table->name, IBD, false); + + if (!filepath) + return DB_OUT_OF_MEMORY; + + SCOPE_EXIT([filepath]() { ut_free(filepath); }); + + bool success; + auto file= os_file_create_simple_no_error_handling( + innodb_data_file_key, filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, false, + &success); + if (!success) + return DB_IO_ERROR; + + if (os_file_get_size(file) < srv_page_size) + return DB_CORRUPTION; + + SCOPE_EXIT([&file]() { os_file_close(file); }); + + std::unique_ptr<byte[], decltype(&aligned_free)> first_page( + static_cast<byte *>(aligned_malloc(srv_page_size, srv_page_size)), + &aligned_free); + + if (dberr_t err= os_file_read(IORequestReadPartial, file, first_page.get(), + 0, srv_page_size, nullptr)) + return err; + + auto space_flags= fsp_header_get_flags(first_page.get()); + + if (!fil_space_t::is_valid_flags(space_flags, true)) + { + auto cflags= fsp_flags_convert_from_101(space_flags); + if (cflags == UINT32_MAX) + return invalid_space_flags(space_flags); + space_flags= static_cast<decltype(space_flags)>(cflags); + } + + if (!cfg.m_missing) + { + if (dberr_t err= cfg.match_flags(current_thd)) + return err; + } + + const unsigned zip_size= fil_space_t::zip_size(space_flags); + const unsigned physical_size= zip_size ? 
zip_size : unsigned(srv_page_size); + ut_ad(physical_size <= UNIV_PAGE_SIZE_MAX); + const uint32_t space_id= page_get_space_id(first_page.get()); + + auto *space_crypt= fil_space_read_crypt_data(zip_size, first_page.get()); + SCOPE_EXIT([&space_crypt]() { + if (space_crypt) + fil_space_destroy_crypt_data(&space_crypt); + }); + + std::unique_ptr<byte[], decltype(&aligned_free)> page( + static_cast<byte *>( + aligned_malloc(UNIV_PAGE_SIZE_MAX, UNIV_PAGE_SIZE_MAX)), + &aligned_free); + + if (dberr_t err= os_file_read( + IORequestReadPartial, file, page.get(), 3 * physical_size, + physical_size, nullptr)) + return err; + + std::unique_ptr<byte[]> page_compress_buf(new byte[get_buf_size()]); + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {page.get(), static_cast<size_t> + (physical_size)}, + space_id, page_compress_buf.get())) + return err; + + if (table->supports_instant()) + { + dict_index_t *index= dict_table_get_first_index(table); + + if (!page_is_comp(page.get()) != !dict_table_is_comp(table)) + { + ib_errf(current_thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "ROW_FORMAT mismatch"); + return DB_CORRUPTION; + } + + if (btr_cur_instant_root_init(index, page.get())) + return DB_CORRUPTION; + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(page.get()) == FIL_PAGE_INDEX) + { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + mem_heap_t *heap= NULL; + SCOPE_EXIT([&heap]() { + if (heap) + mem_heap_free(heap); + }); + + while (btr_page_get_level(page.get()) != 0) + { + const rec_t *rec= page_rec_get_next(page_get_infimum_rec(page.get())); + if (!rec) + return DB_CORRUPTION; + + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init= true); + rec_offs *offsets= + rec_get_offsets(rec, index, nullptr, 0, ULINT_UNDEFINED, &heap); + ut_d(index->in_instant_init= false); + + uint64_t child_page_no= btr_node_ptr_get_child_page_no(rec, offsets); + + if (dberr_t err= + os_file_read(IORequestReadPartial, file, page.get(), + child_page_no * physical_size, physical_size, nullptr)) + return err; + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {page.get(), static_cast<size_t> + (physical_size)}, space_id, + page_compress_buf.get())) + return err; + } + + const auto *rec= page_rec_get_next_const(page_get_infimum_rec(page.get())); + const auto comp= dict_table_is_comp(index->table); + + if (!rec || page_rec_is_supremum(rec)) + { + corrupted_metadata: + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted= true; + return DB_CORRUPTION; + } + + const auto info_bits= rec_get_info_bits(rec, comp); + if (!(info_bits & REC_INFO_MIN_REC_FLAG)) + goto corrupted_metadata; + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG || + (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) + { + incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted= true; + return DB_CORRUPTION; + } + + if (info_bits & REC_INFO_DELETED_FLAG) + { + ulint trx_id_offset= index->trx_id_offset; + ut_ad(index->n_uniq); + + if (trx_id_offset) + { + } + else if (index->table->not_redundant()) + { + + for (uint i= index->n_uniq; i--;) + trx_id_offset+= index->fields[i].fixed_len; + } + else if (rec_get_1byte_offs_flag(rec)) + { + trx_id_offset= rec_1_get_field_end_info(rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & 
REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset&= ~REC_1BYTE_SQL_NULL_MASK; + } + else + { + trx_id_offset= rec_2_get_field_end_info(rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset&= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte *ptr= + rec + trx_id_offset + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) + goto incompatible; + + uint len= mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) != FIL_PAGE_DATA) + goto incompatible; + + std::unique_ptr<byte[], decltype(&aligned_free)> + second_page(static_cast<byte*>(aligned_malloc(physical_size, + physical_size)), + &aligned_free); + + if (dberr_t err= + os_file_read(IORequestReadPartial, file, second_page.get(), + physical_size * + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO), + physical_size, nullptr)) + return err; + + if (dberr_t err= decrypt_decompress(space_crypt, space_flags, + {second_page.get(), + static_cast<size_t>(physical_size)}, + space_id, page_compress_buf.get())) + return err; + + if (fil_page_get_type(second_page.get()) != FIL_PAGE_TYPE_BLOB || + mach_read_from_4( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO]) != + FIL_NULL || + mach_read_from_4( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN]) != len) + goto incompatible; + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte * + b= second_page.get() + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + + len, + *const end= second_page.get() + srv_page_size - BTR_EXTERN_LEN; + b < end;) + { + if (*b++) + goto incompatible; + } + + if (index->table->deserialise_columns( + &second_page[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) + goto incompatible; + } + + rec_offs *offsets= rec_get_offsets( + rec, index, nullptr, index->n_core_fields, ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) + { + inconsistent: + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). */ + if (rec_offs_n_fields(offsets) > + ulint(index->n_fields) + !!index->table->instant && + !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, offsets))) + goto inconsistent; + + for (unsigned i= index->n_core_fields; i < index->n_fields; i++) + { + dict_col_t *col= index->fields[i].col; + const unsigned o= i + !!index->table->instant; + ulint len; + const byte *data= rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len= len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data= field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) + col->def_val.data= mem_heap_dup(index->table->heap, data, len); + else if (len < BTR_EXTERN_FIELD_REF_SIZE || + !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) + { + col->def_val.len= UNIV_SQL_DEFAULT; + goto inconsistent; + } + else + { + col->def_val.data= btr_copy_externally_stored_field( + &col->def_val.len, data, srv_page_size, len, index->table->heap); + } + } + } + + return DB_SUCCESS; +} + +/** +Read the contents of the <tablename>.cfg file. +@return DB_SUCCESS or error code. 
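+If the .cfg file cannot be opened the import is not aborted: cfg.m_missing
+is set, a warning is reported, and DB_FAIL is returned so that the caller
+can fall back to discovering the index root pages from the .ibd file.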
*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_import_read_cfg( +/*================*/ + dict_table_t* table, /*!< in: table */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + cfg.m_table = table; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + FILE* file = fopen(name, "rb"); + + if (file == NULL) { + char msg[BUFSIZ]; + + snprintf(msg, sizeof(msg), + "Error opening '%s', will attempt to import" + " without schema verification", name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR, + (ulong) errno, strerror(errno), msg); + + cfg.m_missing = true; + + err = DB_FAIL; + } else { + + cfg.m_missing = false; + + err = row_import_read_meta_data(file, thd, cfg); + fclose(file); + } + + return(err); +} + +/** Update the root page numbers and tablespace ID of a table. +@param[in,out] trx dictionary transaction +@param[in,out] table persistent table +@param[in] reset whether to reset the fields to FIL_NULL +@return DB_SUCCESS or error code */ +dberr_t +row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) +{ + const dict_index_t* index; + que_t* graph = 0; + dberr_t err = DB_SUCCESS; + + ut_ad(reset || table->space->id == table->space_id); + + static const char sql[] = { + "PROCEDURE UPDATE_INDEX_ROOT() IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES\n" + "SET SPACE = :space,\n" + " PAGE_NO = :page,\n" + " TYPE = :type\n" + "WHERE TABLE_ID = :table_id AND ID = :index_id;\n" + "END;\n"}; + + table->def_trx_id = trx->id; + + for (index = dict_table_get_first_index(table); + index != 0; + index = dict_table_get_next_index(index)) { + + pars_info_t* info; + ib_uint32_t page; + ib_uint32_t space; + ib_uint32_t type; + index_id_t index_id; + table_id_t table_id; + + info = (graph != 0) ? graph->info : pars_info_create(); + + mach_write_to_4( + reinterpret_cast<byte*>(&type), + index->type); + + mach_write_to_4( + reinterpret_cast<byte*>(&page), + reset ? FIL_NULL : index->page); + + mach_write_to_4( + reinterpret_cast<byte*>(&space), + reset ? FIL_NULL : index->table->space_id); + + mach_write_to_8( + reinterpret_cast<byte*>(&index_id), + index->id); + + mach_write_to_8( + reinterpret_cast<byte*>(&table_id), + table->id); + + /* If we set the corrupt bit during the IMPORT phase then + we need to update the system tables. */ + pars_info_bind_int4_literal(info, "type", &type); + pars_info_bind_int4_literal(info, "space", &space); + pars_info_bind_int4_literal(info, "page", &page); + pars_info_bind_ull_literal(info, "index_id", &index_id); + pars_info_bind_ull_literal(info, "table_id", &table_id); + + if (graph == 0) { + graph = pars_sql(info, sql); + ut_a(graph); + graph->trx = trx; + } + + que_thr_t* thr; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + DBUG_EXECUTE_IF("ib_import_internal_error", + trx->error_state = DB_ERROR;); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "While updating the <space, root page" + " number> of index %s - %s", + index->name(), ut_strerr(err)); + + break; + } + } + + que_graph_free(graph); + + return(err); +} + +/** Callback arg for row_import_set_discarded. 
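+The fetch callback reads the current SYS_TABLES.MIX_LEN value, sets or
+clears DICT_TF2_DISCARDED in it according to the requested state, and
+leaves the updated value in flags2 for the UPDATE statement that follows.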
*/ +struct discard_t { + ib_uint32_t flags2; /*!< Value read from column */ + bool state; /*!< New state of the flag */ + ulint n_recs; /*!< Number of recs processed */ +}; + +/******************************************************************//** +Fetch callback that sets or unsets the DISCARDED tablespace flag in +SYS_TABLES. The flags is stored in MIX_LEN column. +@return FALSE if all OK */ +static +ibool +row_import_set_discarded( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + discard_t* discard = static_cast<discard_t*>(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == sizeof(ib_uint32_t)); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + if (discard->state) { + flags2 |= DICT_TF2_DISCARDED; + } else { + flags2 &= ~DICT_TF2_DISCARDED; + } + + mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2); + + ++discard->n_recs; + + /* There should be at most one matching record. */ + ut_a(discard->n_recs == 1); + + return(FALSE); +} + +/** Update the DICT_TF2_DISCARDED flag in SYS_TABLES.MIX_LEN. +@param[in,out] trx dictionary transaction +@param[in] table_id table identifier +@param[in] discarded whether to set or clear the flag +@return DB_SUCCESS or error code */ +dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id, + bool discarded) +{ + pars_info_t* info; + discard_t discard; + + static const char sql[] = + "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN" + " FROM SYS_TABLES" + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + discard.n_recs = 0; + discard.state = discarded; + discard.flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &discard.flags2); + + pars_info_bind_function( + info, "my_func", row_import_set_discarded, &discard); + + dberr_t err = que_eval_sql(info, sql, trx); + + ut_a(discard.n_recs == 1); + ut_a(discard.flags2 != ULINT32_UNDEFINED); + + return(err); +} + +/** InnoDB writes page by page when there is page compressed +tablespace involved. 
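+Only the compressed payload of each page is written back, so the holes
+punched over the unused page tails are preserved.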
It does help to save the disk space when +punch hole is enabled +@param iter Tablespace iterator +@param full_crc32 whether the file is in the full_crc32 format +@param offset offset of the file to be written +@param writeptr buffer to be written +@param n_bytes number of bytes to be written +@param try_punch_only Try the range punch only because the + current range is full of empty pages +@return DB_SUCCESS */ +static +dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter, + bool full_crc32, + os_offset_t offset, + const byte *writeptr, + ulint n_bytes, + bool try_punch_only= false) +{ + if (dberr_t err= os_file_punch_hole(iter.file, offset, n_bytes)) + return err; + + if (try_punch_only) + return DB_SUCCESS; + + for (ulint j= 0; j < n_bytes; j+= srv_page_size) + { + /* Read the original data length from block and + safer to read FIL_PAGE_COMPRESSED_SIZE because it + is not encrypted*/ + ulint n_write_bytes= srv_page_size; + if (j || offset) + { + n_write_bytes= mach_read_from_2(writeptr + j + FIL_PAGE_DATA); + const unsigned ptype= mach_read_from_2(writeptr + j + FIL_PAGE_TYPE); + /* Ignore the empty page */ + if (ptype == 0 && n_write_bytes == 0) + continue; + if (full_crc32) + n_write_bytes= buf_page_full_crc32_size(writeptr + j, + nullptr, nullptr); + else + { + n_write_bytes+= ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED + ? FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN + : FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + } + } + + if (dberr_t err= os_file_write(IORequestWrite, iter.filepath, iter.file, + writeptr + j, offset + j, n_write_bytes)) + return err; + } + + return DB_SUCCESS; +} + +dberr_t FetchIndexRootPages::run(const fil_iterator_t& iter, + buf_block_t* block) UNIV_NOTHROW +{ + const unsigned zip_size= fil_space_t::zip_size(m_space_flags); + const unsigned size= zip_size ? zip_size : unsigned(srv_page_size); + byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size())); + const bool full_crc32 = fil_space_t::full_crc32(m_space_flags); + bool skip_checksum_check = false; + ut_ad(!srv_read_only_mode); + + if (!page_compress_buf) + return DB_OUT_OF_MEMORY; + + const bool encrypted= iter.crypt_data != NULL && + iter.crypt_data->should_encrypt(); + byte* const readptr= iter.io_buffer; + block->page.frame= readptr; + + if (block->page.zip.data) + block->page.zip.data= readptr; + + bool page_compressed= false; + + dberr_t err= os_file_read(IORequestReadPartial, iter.file, readptr, + 3 * size, size, nullptr); + if (err != DB_SUCCESS) + { + ib::error() << iter.filepath << ": os_file_read() failed"; + goto func_exit; + } + + if (page_get_page_no(readptr) != 3) + { +page_corrupted: + ib::warn() << filename() << ": Page 3 at offset " + << 3 * size << " looks corrupted."; + err= DB_CORRUPTION; + goto func_exit; + } + + block->page.id_.set_page_no(3); + if (full_crc32 && fil_space_t::is_compressed(m_space_flags)) + page_compressed= buf_page_is_compressed(readptr, m_space_flags); + else + { + switch (fil_page_get_type(readptr)) { + case FIL_PAGE_PAGE_COMPRESSED: + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + if (block->page.zip.data) + goto page_corrupted; + page_compressed= true; + } + } + + if (encrypted) + { + if (!buf_page_verify_crypt_checksum(readptr, m_space_flags)) + goto page_corrupted; + + if ((err= fil_space_decrypt(get_space_id(), m_space_flags, iter.crypt_data, + readptr, size, readptr))) + goto func_exit; + } + + /* For full_crc32 format, skip checksum check + after decryption. 
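+ In the full_crc32 format the checksum covers the page as it is stored
+ on disk, i.e. the encrypted image, and that checksum was already
+ verified by buf_page_verify_crypt_checksum() before decryption above.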
*/ + skip_checksum_check= full_crc32 && encrypted; + + if (page_compressed) + { + ulint compress_length= fil_page_decompress(page_compress_buf, + readptr, + m_space_flags); + ut_ad(compress_length != srv_page_size); + if (compress_length == 0) + goto page_corrupted; + } + else if (!skip_checksum_check + && buf_page_is_corrupted(false, readptr, m_space_flags)) + goto page_corrupted; + + err= this->operator()(block); +func_exit: + free(page_compress_buf); + return err; +} + +static dberr_t fil_iterate( + const fil_iterator_t& iter, + buf_block_t* block, + AbstractCallback& callback) +{ + os_offset_t offset; + const ulint size = callback.physical_size(); + ulint n_bytes = iter.n_io_buffers * size; + + byte* page_compress_buf= static_cast<byte*>(malloc(get_buf_size())); + ut_ad(!srv_read_only_mode); + + if (!page_compress_buf) { + return DB_OUT_OF_MEMORY; + } + + uint32_t actual_space_id = 0; + const bool full_crc32 = fil_space_t::full_crc32( + callback.get_space_flags()); + + /* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless + copying for non-index pages. Unfortunately, it is + required by buf_zip_decompress() */ + dberr_t err = DB_SUCCESS; + bool page_compressed = false; + bool punch_hole = !my_test_if_thinly_provisioned(iter.file); + + for (offset = iter.start; offset < iter.end; offset += n_bytes) { + if (callback.is_interrupted()) { + err = DB_INTERRUPTED; + goto func_exit; + } + + byte* io_buffer = iter.io_buffer; + block->page.frame = io_buffer; + + if (block->page.zip.data) { + /* Zip IO is done in the compressed page buffer. */ + io_buffer = block->page.zip.data; + } + + /* We have to read the exact number of bytes. Otherwise the + InnoDB IO functions croak on failed reads. */ + + n_bytes = ulint(ut_min(os_offset_t(n_bytes), + iter.end - offset)); + + ut_ad(n_bytes > 0); + ut_ad(!(n_bytes % size)); + + const bool encrypted = iter.crypt_data != NULL + && iter.crypt_data->should_encrypt(); + /* Use additional crypt io buffer if tablespace is encrypted */ + byte* const readptr = encrypted + ? iter.crypt_io_buffer : io_buffer; + byte* const writeptr = readptr; + + err = os_file_read(IORequestReadPartial, iter.file, readptr, + offset, n_bytes, nullptr); + if (err != DB_SUCCESS) { + ib::error() << iter.filepath + << ": os_file_read() failed"; + goto func_exit; + } + + bool updated = false; + os_offset_t page_off = offset; + ulint n_pages_read = n_bytes / size; + /* This block is not attached to buf_pool */ + block->page.id_.set_page_no(uint32_t(page_off / size)); + + for (ulint i = 0; i < n_pages_read; + ++block->page.id_, + ++i, page_off += size, block->page.frame += size) { + byte* src = readptr + i * size; + const ulint page_no = page_get_page_no(src); + if (!page_no && block->page.id().page_no()) { + if (!buf_is_zeroes(span<const byte>(src, + size))) { + goto page_corrupted; + } + /* Proceed to the next page, + because this one is all zero. 
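+ An all-zero page is simply skipped; there is nothing in it to convert,
+ and the buf_is_zeroes() check above already verified that the page
+ really is empty.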
*/ + continue; + } + + if (page_no != block->page.id().page_no()) { +page_corrupted: + ib::warn() << callback.filename() + << ": Page " << (offset / size) + << " at offset " << offset + << " looks corrupted."; + err = DB_CORRUPTION; + goto func_exit; + } + + if (block->page.id().page_no() == 0) { + actual_space_id = mach_read_from_4( + src + FIL_PAGE_SPACE_ID); + } + + const uint16_t type = fil_page_get_type(src); + page_compressed = + (full_crc32 + && fil_space_t::is_compressed( + callback.get_space_flags()) + && buf_page_is_compressed( + src, callback.get_space_flags())) + || type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED + || type == FIL_PAGE_PAGE_COMPRESSED; + + if (page_compressed && block->page.zip.data) { + goto page_corrupted; + } + + bool decrypted = false; + byte* dst = io_buffer + i * size; + bool frame_changed = false; + uint key_version = buf_page_get_key_version( + src, callback.get_space_flags()); + + if (!encrypted) { + } else if (!key_version) { + if (block->page.id().page_no() == 0 + && block->page.zip.data) { + block->page.zip.data = src; + frame_changed = true; + } else if (!page_compressed + && type != FIL_PAGE_TYPE_XDES + && !block->page.zip.data) { + block->page.frame = src; + frame_changed = true; + } else { + ut_ad(dst != src); + memcpy(dst, src, size); + } + } else { + if (!buf_page_verify_crypt_checksum( + src, callback.get_space_flags())) { + goto page_corrupted; + } + + if ((err = fil_space_decrypt( + actual_space_id, + callback.get_space_flags(), + iter.crypt_data, dst, + callback.physical_size(), + src))) { + goto func_exit; + } + + decrypted = true; + updated = true; + } + + /* For full_crc32 format, skip checksum check + after decryption. */ + bool skip_checksum_check = full_crc32 && encrypted; + + /* If the original page is page_compressed, we need + to decompress it before adjusting further. */ + if (page_compressed) { + ulint compress_length = fil_page_decompress( + page_compress_buf, dst, + callback.get_space_flags()); + ut_ad(compress_length != srv_page_size); + if (compress_length == 0) { + goto page_corrupted; + } + updated = true; + } else if (!skip_checksum_check + && buf_page_is_corrupted( + false, + encrypted && !frame_changed + ? dst : src, + callback.get_space_flags())) { + goto page_corrupted; + } + + if ((err = callback(block)) != DB_SUCCESS) { + goto func_exit; + } else if (!updated) { + updated = !!block->page.frame; + } + + /* If tablespace is encrypted we use additional + temporary scratch area where pages are read + for decrypting readptr == crypt_io_buffer != io_buffer. + + Destination for decryption is a buffer pool block + block->page.frame == dst == io_buffer that is updated. + Pages that did not require decryption even when + tablespace is marked as encrypted are not copied + instead block->page.frame is set to src == readptr. + + For encryption we again use temporary scratch area + writeptr != io_buffer == dst + that is then written to the tablespace + + (1) For normal tables io_buffer == dst == writeptr + (2) For only page compressed tables + io_buffer == dst == writeptr + (3) For encrypted (and page compressed) + readptr != io_buffer == dst != writeptr + */ + + ut_ad(!encrypted && !page_compressed ? + src == dst && dst == writeptr + (i * size):1); + ut_ad(page_compressed && !encrypted ? + src == dst && dst == writeptr + (i * size):1); + ut_ad(encrypted ? + src != dst && dst != writeptr + (i * size):1); + + /* When tablespace is encrypted or compressed its + first page (i.e. 
page 0) is not encrypted or + compressed and there is no need to copy frame. */ + if (encrypted && block->page.id().page_no() != 0) { + byte *local_frame = callback.get_frame(block); + ut_ad((writeptr + (i * size)) != local_frame); + memcpy((writeptr + (i * size)), local_frame, size); + } + + if (frame_changed) { + if (block->page.zip.data) { + block->page.zip.data = dst; + } else { + block->page.frame = dst; + } + } + + src = io_buffer + (i * size); + + if (page_compressed) { + updated = true; + if (ulint len = fil_page_compress( + src, + page_compress_buf, + callback.get_space_flags(), + 512,/* FIXME: proper block size */ + encrypted)) { + /* FIXME: remove memcpy() */ + memcpy(src, page_compress_buf, len); + memset(src + len, 0, + srv_page_size - len); + } + } + + /* Encrypt the page if encryption was used. */ + if (encrypted && decrypted) { + byte *dest = writeptr + i * size; + + byte* tmp = fil_encrypt_buf( + iter.crypt_data, + block->page.id().space(), + block->page.id().page_no(), + src, block->zip_size(), dest, + full_crc32); + + if (tmp == src) { + /* TODO: remove unnecessary memcpy's */ + ut_ad(dest != src); + memcpy(dest, src, size); + } + + updated = true; + } + + /* Write checksum for the compressed full crc32 page.*/ + if (full_crc32 && page_compressed) { + ut_ad(updated); + byte* dest = writeptr + i * size; + ut_d(bool comp = false); + ut_d(bool corrupt = false); + ulint size = buf_page_full_crc32_size( + dest, +#ifdef UNIV_DEBUG + &comp, &corrupt +#else + NULL, NULL +#endif + ); + ut_ad(!comp == (size == srv_page_size)); + ut_ad(!corrupt); + mach_write_to_4(dest + (size - 4), + my_crc32c(0, dest, size - 4)); + } + } + + if (page_compressed && punch_hole) { + err = fil_import_compress_fwrite( + iter, full_crc32, offset, writeptr, n_bytes, + !updated); + + if (err != DB_SUCCESS) { + punch_hole = false; + if (updated) { + goto normal_write; + } + } + } else if (updated) { +normal_write: + /* A page was updated in the set, write it back. */ + err = os_file_write(IORequestWrite, + iter.filepath, iter.file, + writeptr, offset, n_bytes); + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + +func_exit: + free(page_compress_buf); + return err; +} + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +static +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + AbstractCallback& callback) +{ + dberr_t err; + pfs_os_file_t file; + char* filepath; + + ut_a(n_io_buffers > 0); + ut_ad(!srv_read_only_mode); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_1", + return(DB_CORRUPTION);); + + /* Make sure the data_dir_path is set. */ + dict_get_and_save_data_dir_path(table); + + ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path); + + const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags) + ? 
table->data_dir_path : nullptr; + + filepath = fil_make_filepath(data_dir_path, + {table->name.m_name, + strlen(table->name.m_name)}, + IBD, data_dir_path != nullptr); + if (!filepath) { + return(DB_OUT_OF_MEMORY); + } else { + bool success; + + file = os_file_create_simple_no_error_handling( + innodb_data_file_key, filepath, + OS_FILE_OPEN, OS_FILE_READ_WRITE, false, &success); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + ib::error() << "Trying to import a tablespace," + " but could not open the tablespace file " + << filepath; + ut_free(filepath); + return DB_TABLESPACE_NOT_FOUND; + } else { + err = DB_SUCCESS; + } + } + + callback.set_file(filepath, file); + + os_offset_t file_size = os_file_get_size(file); + ut_a(file_size != (os_offset_t) -1); + + /* Allocate a page to read in the tablespace header, so that we + can determine the page size and zip_size (if it is compressed). + We allocate an extra page in case it is a compressed table. */ + + byte* page = static_cast<byte*>(aligned_malloc(2 * srv_page_size, + srv_page_size)); + + buf_block_t* block = reinterpret_cast<buf_block_t*> + (ut_zalloc_nokey(sizeof *block)); + block->page.frame = page; + block->page.init(buf_page_t::UNFIXED + 1, page_id_t{~0ULL}); + + /* Read the first page and determine the page size. */ + + err = os_file_read(IORequestReadPartial, file, page, 0, srv_page_size, + nullptr); + + if (err == DB_SUCCESS) { + err = callback.init(file_size, block); + } + + if (err == DB_SUCCESS) { + block->page.id_ = page_id_t(callback.get_space_id(), 0); + if (ulint zip_size = callback.get_zip_size()) { + page_zip_set_size(&block->page.zip, zip_size); + /* ROW_FORMAT=COMPRESSED is not optimised for block IO + for now. We do the IMPORT page by page. */ + n_io_buffers = 1; + } + + fil_iterator_t iter; + + /* read (optional) crypt data */ + iter.crypt_data = fil_space_read_crypt_data( + callback.get_zip_size(), page); + + /* If tablespace is encrypted, it needs extra buffers */ + if (iter.crypt_data && n_io_buffers > 1) { + /* decrease io buffers so that memory + consumption will not double */ + n_io_buffers /= 2; + } + + iter.file = file; + iter.start = 0; + iter.end = file_size; + iter.filepath = filepath; + iter.file_size = file_size; + iter.n_io_buffers = n_io_buffers; + + /* Add an extra page for compressed page scratch area. */ + iter.io_buffer = static_cast<byte*>( + aligned_malloc((1 + iter.n_io_buffers) + << srv_page_size_shift, srv_page_size)); + + iter.crypt_io_buffer = iter.crypt_data + ? static_cast<byte*>( + aligned_malloc((1 + iter.n_io_buffers) + << srv_page_size_shift, + srv_page_size)) + : NULL; + + if (block->page.zip.ssize) { + ut_ad(iter.n_io_buffers == 1); + block->page.frame = iter.io_buffer; + block->page.zip.data = block->page.frame + + srv_page_size; + } + + err = callback.run(iter, block); + + if (iter.crypt_data) { + fil_space_destroy_crypt_data(&iter.crypt_data); + } + + aligned_free(iter.crypt_io_buffer); + aligned_free(iter.io_buffer); + } + + if (err == DB_SUCCESS) { + ib::info() << "Sync to disk"; + + if (!os_file_flush(file)) { + ib::info() << "os_file_flush() failed!"; + err = DB_IO_ERROR; + } else { + ib::info() << "Sync to disk - done!"; + } + } + + os_file_close(file); + + aligned_free(page); + ut_free(filepath); + ut_free(block); + + return(err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. 
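+
+A typical SQL sequence that reaches this function is (illustrative):
+
+	ALTER TABLE t DISCARD TABLESPACE;
+	-- copy t.ibd (and, if available, t.cfg) into the data directory
+	ALTER TABLE t IMPORT TABLESPACE;
+
+The import reads and verifies the .cfg meta-data (or reconstructs it from
+the .ibd file when the .cfg is missing), converts every page of the
+tablespace, adjusts the index root pages, purges any left-over
+delete-marked records, flushes the changes to disk and finally updates
+SYS_INDEXES and the DISCARDED flag in the data dictionary.
+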
+@return error code or DB_SUCCESS */ +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ +{ + dberr_t err; + ib_uint64_t autoinc = 0; + char* filepath = NULL; + trx_t* trx = prebuilt->trx; + + /* The caller assured that this is not read_only_mode and that no + temorary tablespace is being imported. */ + ut_ad(!srv_read_only_mode); + ut_ad(!table->is_temporary()); + + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_ad(trx); + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(!table->is_readable()); + + ibuf_delete_for_discarded_space(table->space_id); + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + /* TODO: Do not write any undo log for the IMPORT cleanup. */ + { + mtr_t mtr; + mtr.start(); + trx_undo_assign(trx, &err, &mtr); + mtr.commit(); + } + + DBUG_EXECUTE_IF("ib_import_undo_assign_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err == DB_SUCCESS && !trx->has_logged_persistent()) { + err = DB_TOO_MANY_CONCURRENT_TRXS; + } + if (err != DB_SUCCESS) { + return row_import_cleanup(prebuilt, err); + } + + trx->op_info = "read meta-data file"; + + row_import cfg; + THD* thd = trx->mysql_thd; + + err = row_import_read_cfg(table, thd, cfg); + + /* Check if the table column definitions match the contents + of the config file. */ + + if (err == DB_SUCCESS) { + + if (dberr_t err = handle_instant_metadata(table, cfg)) { + return row_import_error(prebuilt, err); + } + + /* We have a schema file, try and match it with our + data dictionary. */ + + err = cfg.match_schema(thd); + + /* Update index->page and SYS_INDEXES.PAGE_NO to match the + B-tree root page numbers in the tablespace. Use the index + name from the .cfg file to find match. */ + + if (err == DB_SUCCESS) { + cfg.set_root_by_name(); + autoinc = cfg.m_autoinc; + } + + DBUG_EXECUTE_IF("ib_import_set_index_root_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + } else if (cfg.m_missing) { + /* We don't have a schema file, we will have to discover + the index root pages from the .ibd file and skip the schema + matching step. */ + + ut_a(err == DB_FAIL); + + cfg.m_zip_size = 0; + + if (UT_LIST_GET_LEN(table->indexes) > 1) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Drop all secondary indexes before importing " + "table %s when .cfg file is missing.", + table->name.m_name); + err = DB_ERROR; + return row_import_error(prebuilt, err); + } + + FetchIndexRootPages fetchIndexRootPages(table, trx); + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(srv_page_size), + fetchIndexRootPages); + + if (err == DB_SUCCESS) { + + err = fetchIndexRootPages.build_row_import(&cfg); + + /* Update index->page and SYS_INDEXES.PAGE_NO + to match the B-tree root page numbers in the + tablespace. */ + + if (err == DB_SUCCESS) { + err = cfg.set_root_by_heuristic(); + + if (err == DB_SUCCESS) { + err = handle_instant_metadata(table, + cfg); + } + } + } + } + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + trx->op_info = "importing tablespace"; + + ib::info() << "Phase I - Update all pages"; + + /* Iterate over all the pages and do the sanity checking and + the conversion required to import the tablespace. */ + + PageConverter converter(&cfg, table->space_id, trx); + + /* Set the IO buffer size in pages. */ + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_zip_size ? 
cfg.m_zip_size + : srv_page_size), converter); + + DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); +#ifdef BTR_CUR_HASH_ADAPT + /* On DISCARD TABLESPACE, we did not drop any adaptive hash + index entries. If we replaced the discarded tablespace with a + smaller one here, there could still be some adaptive hash + index entries that point to cached garbage pages in the buffer + pool, because PageConverter::operator() only evicted those + pages that were replaced by the imported pages. We must + detach any remaining adaptive hash index entries, because the + adaptive hash index must be a subset of the table contents; + false positives are not tolerated. */ + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index; + index = UT_LIST_GET_NEXT(indexes, index)) { + index = index->clone_if_needed(); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (err != DB_SUCCESS) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + table->name.m_name); + + if (err != DB_DECRYPTION_FAILED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Error importing tablespace for table %s : %s", + table_name, ut_strerr(err)); + } + + return row_import_cleanup(prebuilt, err); + } + + /* If the table is stored in a remote tablespace, we need to + determine that filepath from the link file and system tables. + Find the space ID in SYS_TABLES since this is an ALTER TABLE. */ + dict_get_and_save_data_dir_path(table); + + ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path); + const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags) + ? table->data_dir_path : nullptr; + fil_space_t::name_type name{ + table->name.m_name, strlen(table->name.m_name)}; + + filepath = fil_make_filepath(data_dir_path, name, IBD, + data_dir_path != nullptr); + + DBUG_EXECUTE_IF( + "ib_import_OOM_15", + ut_free(filepath); + filepath = NULL; + ); + + if (filepath == NULL) { + return row_import_cleanup(prebuilt, DB_OUT_OF_MEMORY); + } + + /* Open the tablespace so that we can access via the buffer pool. + The tablespace is initially opened as a temporary one, because + we will not be writing any redo log for it before we have invoked + fil_space_t::set_imported() to declare it a persistent tablespace. */ + + table->space = fil_ibd_open( + 2, FIL_TYPE_IMPORT, table->space_id, + dict_tf_to_fsp_flags(table->flags), name, filepath, &err); + + ut_ad((table->space == NULL) == (err != DB_SUCCESS)); + DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", + err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;); + + if (!table->space) { + ib_senderrf(thd, IB_LOG_LEVEL_ERROR, + ER_GET_ERRMSG, + err, ut_strerr(err), filepath); + } + + ut_free(filepath); + + if (err == DB_SUCCESS) { + err = ibuf_check_bitmap_on_import(trx, table->space); + } + + DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_cleanup(prebuilt, err); + } + + /* The first index must always be the clustered index. */ + + dict_index_t* index = dict_table_get_first_index(table); + + if (!dict_index_is_clust(index)) { + return row_import_error(prebuilt, DB_CORRUPTION); + } + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. 
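+	This adjusts the clustered index only; the secondary index roots
+	are handled later by
+	row_import_adjust_root_pages_of_secondary_indexes().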
*/ + + err = btr_root_adjust_on_import(index); + + DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } else if (cfg.requires_purge(index->name)) { + + /* Purge any delete-marked records that couldn't be + purged during the page conversion phase from the + cluster index. */ + + IndexPurge purge(trx, index); + + trx->op_info = "cluster: purging delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + } + + DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + /* For secondary indexes, purge any records that couldn't be purged + during the page conversion phase. */ + + err = row_import_adjust_root_pages_of_secondary_indexes( + trx, table, cfg); + + DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + /* Ensure that the next available DB_ROW_ID is not smaller than + any DB_ROW_ID stored in the table. */ + + if (prebuilt->clust_index_was_generated) { + row_import_set_sys_max_row_id(prebuilt, table); + } + + ib::info() << "Phase III - Flush changes to disk"; + + /* Ensure that all pages dirtied during the IMPORT make it to disk. + The only dirty pages generated should be from the pessimistic purge + of delete marked records that couldn't be purged in Phase I. */ + while (buf_flush_list_space(prebuilt->table->space)); + + for (ulint count = 0; prebuilt->table->space->referenced(); count++) { + /* Issue a warning every 10.24 seconds, starting after + 2.56 seconds */ + if ((count & 511) == 128) { + ib::warn() << "Waiting for flush to complete on " + << prebuilt->table->name; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + ib::info() << "Phase IV - Flush complete"; + prebuilt->table->space->set_imported(); + + /* The dictionary latches will be released in in row_import_cleanup() + after the transaction commit, for both success and error. */ + + row_mysql_lock_data_dictionary(trx); + + /* Update the root pages of the table's indexes. */ + err = row_import_update_index_root(trx, table, false); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + err = row_import_update_discarded_flag(trx, table->id, false); + + if (err != DB_SUCCESS) { + return row_import_error(prebuilt, err); + } + + table->file_unreadable = false; + table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1); + + /* Set autoinc value read from .cfg file, if one was specified. + Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */ + if (autoinc) { + ib::info() << table->name << " autoinc value set to " + << autoinc; + + table->autoinc = autoinc--; + btr_write_autoinc(dict_table_get_first_index(table), autoinc); + } + + return row_import_cleanup(prebuilt, err); +} diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc new file mode 100644 index 00000000..bdee0ed1 --- /dev/null +++ b/storage/innobase/row/row0ins.cc @@ -0,0 +1,3843 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0ins.cc +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0ins.h" +#include "dict0dict.h" +#include "trx0rec.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "mach0data.h" +#include "ibuf0ibuf.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0sel.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "eval0eval.h" +#include "data0data.h" +#include "buf0lru.h" +#include "fts0fts.h" +#include "fts0types.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif +#ifdef WITH_WSREP +#include <wsrep.h> +#include <mysql/service_wsrep.h> +#include "ha_prototypes.h" +#endif /* WITH_WSREP */ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/** Create an row template for each index of a table. */ +static void ins_node_create_entry_list(ins_node_t *node) +{ + node->entry_list.reserve(UT_LIST_GET_LEN(node->table->indexes)); + + for (dict_index_t *index= dict_table_get_first_index(node->table); index; + index= dict_table_get_next_index(index)) + { + /* Corrupted or incomplete secondary indexes will be filtered out in + row_ins(). */ + dtuple_t *entry= index->online_status >= ONLINE_INDEX_ABORTED + ? dtuple_create(node->entry_sys_heap, 0) + : row_build_index_entry_low(node->row, NULL, index, node->entry_sys_heap, + ROW_BUILD_FOR_INSERT); + node->entry_list.push_back(entry); + } +} + +/*****************************************************************//** +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /*!< in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + const dict_col_t* col; + dfield_t* dfield; + + row = node->row; + table = node->table; + + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* allocate buffer to hold the needed system created hidden columns. */ + compile_time_assert(DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + == sizeof node->sys_buf); + memset(node->sys_buf, 0, sizeof node->sys_buf); + /* Assign DB_ROLL_PTR to 1 << ROLL_PTR_INSERT_FLAG_POS */ + node->sys_buf[DATA_ROW_ID_LEN + DATA_TRX_ID_LEN] = 0x80; + ut_ad(!memcmp(node->sys_buf + DATA_ROW_ID_LEN, reset_trx_id, + sizeof reset_trx_id)); + + /* 1. 
Populate row-id */ + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, node->sys_buf, DATA_ROW_ID_LEN); + + /* 2. Populate trx id */ + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN], + DATA_TRX_ID_LEN); + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_set_data(dfield, &node->sys_buf[DATA_ROW_ID_LEN + + DATA_TRX_ID_LEN], + DATA_ROLL_PTR_LEN); +} + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row) /*!< in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry_list.clear(); + node->entry = node->entry_list.end(); + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = 0; +} + +/*******************************************************************//** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_sec_index_entry_by_modify( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + rec_offs** offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* dummy_big_rec; + upd_t* update; + rec_t* rec; + dberr_t err; + + rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->is_clust()); + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + ut_ad(!entry->info_bits); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + update = row_upd_build_sec_rec_difference_binary( + rec, cursor->index(), *offsets, entry, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* We should never insert in place of a record that + has not been delete-marked. The only exception is when + online CREATE INDEX copied the changes that we already + made to the clustered index, and completed the + secondary index creation before we got here. In this + case, the change would already be there. 
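// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_alloc_sys_fields()
// above points DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR at one shared zeroed
// buffer and sets the first DB_ROLL_PTR byte to 0x80, i.e. the roll pointer
// becomes 1 << 55 with the "insert" flag set. A standalone picture of that
// layout, assuming the usual 6/6/7-byte system column lengths:
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
	enum { ROW_ID_LEN = 6, TRX_ID_LEN = 6, ROLL_PTR_LEN = 7 };
	uint8_t sys_buf[ROW_ID_LEN + TRX_ID_LEN + ROLL_PTR_LEN];

	std::memset(sys_buf, 0, sizeof sys_buf);
	sys_buf[ROW_ID_LEN + TRX_ID_LEN] = 0x80;  // insert flag of DB_ROLL_PTR

	// bytes [0,6)   -> DB_ROW_ID   (0 until assigned)
	// bytes [6,12)  -> DB_TRX_ID   (0 = "reset" transaction id)
	// bytes [12,19) -> DB_ROLL_PTR (only the top bit, the insert flag, set)
	assert(sizeof sys_buf == 19 && sys_buf[12] == 0x80);
	return 0;
}
// --------------------------------------------------------------------------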
The CREATE + INDEX should be in wait_while_table_is_used() at least + until this INSERT or UPDATE returns. After that point, + set_committed(true) would be invoked in + commit_inplace_alter_table(). */ + ut_a(update->n_fields == 0); + ut_ad(!dict_index_is_online_ddl(cursor->index())); + return cursor->index()->is_committed() + ? DB_CORRUPTION : DB_SUCCESS; + } + + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + /* TODO: pass only *offsets */ + err = btr_cur_optimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + ut_ad(mode == BTR_INSERT_TREE); + if (buf_pool.running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, + heap, &dummy_big_rec, update, 0, + thr, thr_get_trx(thr)->id, mtr); + ut_ad(!dummy_big_rec); + } + + return(err); +} + +/*******************************************************************//** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. +@return DB_SUCCESS, DB_FAIL, or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_clust_index_entry_by_modify( +/*================================*/ + btr_pcur_t* pcur, /*!< in/out: a persistent cursor pointing + to the clust_rec that is being modified. */ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap that can + be emptied, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr; must be committed before + latching any further pages */ +{ + const rec_t* rec; + upd_t* update; + dberr_t err = DB_SUCCESS; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + TABLE* mysql_table = NULL; + ut_ad(cursor->index()->is_clust()); + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, + cursor->index()->table->not_redundant())); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. 
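// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The function above first
// tries btr_cur_optimistic_update() under a leaf-page latch and translates
// "does not fit in the page" results (DB_OVERFLOW, DB_UNDERFLOW,
// DB_ZIP_OVERFLOW) into DB_FAIL, so the caller can retry the whole operation
// with a tree latch and the pessimistic variant. A generic, hypothetical
// rendering of that calling pattern:
#include <functional>

enum class UpdStatus { OK, RETRY_WITH_TREE_LATCH, ERROR };

static UpdStatus update_with_fallback(
	const std::function<UpdStatus()>& optimistic,
	const std::function<UpdStatus()>& pessimistic)
{
	UpdStatus s = optimistic();               // cheap, in-page attempt
	if (s == UpdStatus::RETRY_WITH_TREE_LATCH) {
		s = pessimistic();                // may split/merge B-tree pages
	}
	return s;
}
// --------------------------------------------------------------------------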
*/ + ut_ad(rec_get_trx_id(rec, cursor->index())); + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + if (thr->prebuilt != NULL) { + mysql_table = thr->prebuilt->m_mysql_table; + ut_ad(thr->prebuilt->trx == thr_get_trx(thr)); + } + + update = row_upd_build_difference_binary( + cursor->index(), entry, rec, NULL, true, true, + thr_get_trx(thr), heap, mysql_table, &err); + if (err != DB_SUCCESS) { + return(err); + } + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED + || mode == BTR_MODIFY_ROOT_AND_LEAF + || mode == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED); + + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update( + flags, cursor, offsets, offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + if (buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; + } + + big_rec_t* big_rec = NULL; + + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_POS_FLAG, + cursor, offsets, offsets_heap, heap, + &big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + + DEBUG_SYNC_C("before_row_ins_upd_extern"); + err = btr_store_big_rec_extern_fields( + pcur, *offsets, big_rec, mtr, + BTR_STORE_INSERT_UPDATE); + DEBUG_SYNC_C("after_row_ins_upd_extern"); + dtuple_big_rec_free(big_rec); + } + } + + return(err); +} + +/*********************************************************************//** +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. +@return TRUE if an ancestor updates table */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + que_node_t* node, /*!< in: node in a query graph */ + dict_table_t* table) /*!< in: table */ +{ + que_node_t* parent; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + upd_node_t* upd_node; + + upd_node = static_cast<upd_node_t*>(parent); + + if (upd_node->table == table && !upd_node->is_delete) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************//** +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. +@return number of ancestors */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + que_node_t* node) /*!< in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + for (parent = que_node_get_parent(node); + que_node_get_type(parent) == QUE_NODE_UPDATE; + parent = que_node_get_parent(parent)) { + + n_ancestors++; + } + + return(n_ancestors); +} + +/******************************************************************//** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. 
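// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The two helpers above walk
// the query graph upwards through que_node_get_parent() and inspect the
// enclosing UPDATE nodes; the depth they compute is later compared against
// FK_MAX_CASCADE_DEL to refuse overly deep cascades. A minimal model of that
// walk (Node is a hypothetical stand-in for the query-graph node type):
struct Node { const Node* parent; bool is_update_node; };

static unsigned count_update_ancestors(const Node* n)
{
	unsigned depth = 0;
	for (const Node* p = n->parent; p && p->is_update_node; p = p->parent)
		depth++;                 // one level per cascaded update/delete
	return depth;
}
// --------------------------------------------------------------------------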
+@return whether any FULLTEXT INDEX is affected */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_ins_cascade_calc_update_vec( +/*============================*/ + upd_node_t* node, /*!< in: update node of the parent + table */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap, /*!< in: memory heap to use as + temporary storage */ + trx_t* trx) /*!< in: update transaction */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + ulint n_fields_updated; + ulint parent_field_no; + ulint i; + ulint j; + bool doc_id_updated = false; + unsigned doc_id_pos = 0; + doc_id_t new_doc_id = FTS_NULL_DOC_ID; + ulint prefix_col; + + ut_a(cascade); + ut_a(table); + ut_a(index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. */ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + + n_fields_updated = 0; + + bool affects_fulltext = foreign->affects_fulltext(); + + if (table->fts) { + doc_id_pos = dict_table_get_nth_col_pos( + table, table->fts->doc_col, &prefix_col); + } + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no(parent_index, i), + &prefix_col); + + for (j = 0; j < parent_update->n_fields; j++) { + const upd_field_t* parent_ufield + = &parent_update->fields[j]; + + if (parent_ufield->field_no == parent_field_no) { + + ulint min_size; + const dict_col_t* col; + ulint ufield_len; + upd_field_t* ufield; + + col = dict_index_get_nth_col(index, i); + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no = static_cast<uint16_t>( + dict_table_get_nth_col_pos( + table, dict_col_get_no(col), + &prefix_col)); + + ufield->orig_len = 0; + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + dfield_get_type(&ufield->new_val)->prtype |= + col->prtype & DATA_VERSIONED; + ufield_len = dfield_get_len(&ufield->new_val); + + /* Clear the "external storage" flag */ + dfield_set_len(&ufield->new_val, ufield_len); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (dfield_is_null(&ufield->new_val) + && (col->prtype & DATA_NOT_NULL)) { + goto err_exit; + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (!dfield_is_null(&ufield->new_val) + && dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + col->len, + ufield_len, + static_cast<char*>( + dfield_get_data( + &ufield->new_val))) + < ufield_len) { + goto err_exit; + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + min_size = dict_col_get_min_size(col); + + /* Because UNIV_SQL_NULL (the marker + of SQL NULL values) exceeds all possible + values of min_size, the test below will + not hold for SQL NULL columns. 
*/ + + if (min_size > ufield_len) { + + byte* pad; + ulint pad_len; + byte* padded_data; + ulint mbminlen; + + padded_data = static_cast<byte*>( + mem_heap_alloc( + heap, min_size)); + + pad = padded_data + ufield_len; + pad_len = min_size - ufield_len; + + memcpy(padded_data, + dfield_get_data(&ufield + ->new_val), + ufield_len); + + mbminlen = dict_col_get_mbminlen(col); + + ut_ad(!(ufield_len % mbminlen)); + ut_ad(!(min_size % mbminlen)); + + if (mbminlen == 1 + && dtype_get_charset_coll( + col->prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL) { + /* Do not pad BINARY columns */ + goto err_exit; + } + + row_mysql_pad_col(mbminlen, + pad, pad_len); + dfield_set_data(&ufield->new_val, + padded_data, min_size); + } + + /* If Doc ID is updated, check whether the + Doc ID is valid */ + if (table->fts + && ufield->field_no == doc_id_pos) { + doc_id_t n_doc_id; + + n_doc_id = + table->fts->cache->next_doc_id; + + new_doc_id = fts_read_doc_id( + static_cast<const byte*>( + dfield_get_data( + &ufield->new_val))); + + affects_fulltext = true; + doc_id_updated = true; + + if (new_doc_id <= 0) { + ib::error() << "FTS Doc ID" + " must be larger than" + " 0"; + goto err_exit; + } + + if (new_doc_id < n_doc_id) { + ib::error() << "FTS Doc ID" + " must be larger than " + << n_doc_id - 1 + << " for table " + << table->name; + goto err_exit; + } + } + + n_fields_updated++; + } + } + } + + if (affects_fulltext) { + ut_ad(table->fts); + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t doc_id; + doc_id_t* next_doc_id; + upd_field_t* ufield; + + next_doc_id = static_cast<doc_id_t*>(mem_heap_alloc( + heap, sizeof(doc_id_t))); + + ut_ad(!doc_id_updated); + ufield = update->fields + n_fields_updated; + fts_get_next_doc_id(table, next_doc_id); + doc_id = fts_update_doc_id(table, ufield, next_doc_id); + n_fields_updated++; + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } else { + if (doc_id_updated) { + ut_ad(new_doc_id); + fts_trx_add_op(trx, table, new_doc_id, + FTS_INSERT, NULL); + } else { + ib::error() << "FTS Doc ID must be updated" + " along with FTS indexed column for" + " table " << table->name; +err_exit: + n_fields_updated = ULINT_UNDEFINED; + } + } + } + + update->n_fields = n_fields_updated; + + return affects_fulltext; +} + +/*********************************************************************//** +Set detailed error message associated with foreign key errors for +the given transaction. */ +static +void +row_ins_set_detailed( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign) /*!< in: foreign key constraint */ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&srv_misc_tmpfile_mutex); + rewind(srv_misc_tmpfile); + + if (os_file_set_eof(srv_misc_tmpfile)) { + ut_print_name(srv_misc_tmpfile, trx, + foreign->foreign_table_name); + std::string fk_str = dict_print_info_on_foreign_key_in_create_format( + trx, foreign, FALSE); + fputs(fk_str.c_str(), srv_misc_tmpfile); + trx_set_detailed_error_from_file(trx, srv_misc_tmpfile); + } else { + trx_set_detailed_error(trx, "temp file operation failed"); + } + + mysql_mutex_unlock(&srv_misc_tmpfile_mutex); +} + +/*********************************************************************//** +Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file +and displays information about the given transaction. +The caller must release dict_foreign_err_mutex. 
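// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] When the cascaded value is
// shorter than the child's fixed-length column (min_size > ufield_len), the
// code above copies the value into a heap buffer and space-pads the tail via
// row_mysql_pad_col(); BINARY columns are never padded, the cascade is
// rejected instead. A standalone sketch of the single-byte-charset case
// (mbminlen == 1), with hypothetical names:
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<uint8_t> pad_char_value(const uint8_t* data,
					   std::size_t len,
					   std::size_t child_min_size)
{
	// Caller guarantees len <= child_min_size for a fixed-length CHAR.
	std::vector<uint8_t> padded(child_min_size, 0x20 /* space */);
	std::memcpy(padded.data(), data, len);
	return padded;
}
// --------------------------------------------------------------------------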
*/ +TRANSACTIONAL_TARGET +static +void +row_ins_foreign_trx_print( +/*======================*/ + trx_t* trx) /*!< in: transaction */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + ut_ad(!srv_read_only_mode); + + { + TMLockMutexGuard g{SRW_LOCK_CALL}; + n_rec_locks = trx->lock.n_rec_locks; + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + } + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(dict_foreign_err_file); + ut_print_timestamp(dict_foreign_err_file); + fputs(" Transaction:\n", dict_foreign_err_file); + + trx_print_low(dict_foreign_err_file, trx, 600, + n_rec_locks, n_trx_locks, heap_size); + + mysql_mutex_assert_owner(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error associated with an update or a delete of a +parent table index entry. */ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /*!< in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a matching index record in the + child table */ + const dtuple_t* entry) /*!< in: index entry in the parent + table */ +{ + std::string fk_str; + + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign, + TRUE); + fputs(fk_str.c_str(), ef); + putc('\n', ef); + fputs(errstr, ef); + fprintf(ef, " in parent table, in index %s", + foreign->referenced_index->name()); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fprintf(ef, ", in index %s", foreign->foreign_index->name()); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Reports a foreign key error to dict_foreign_err_file when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: a record in the parent table: + it does not match entry because we + have an error! 
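// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_foreign_trx_print()
// above copies the lock counters inside a short TMLockMutexGuard scope and
// only then performs the slow formatted printing, so the global lock mutex
// is never held across file I/O. The same shape with standard-library types
// (all names here are hypothetical):
#include <cstddef>
#include <cstdio>
#include <mutex>

struct LockStats { unsigned rec_locks; unsigned trx_locks; std::size_t heap_bytes; };

static void print_lock_stats(std::mutex& lock_mutex, const LockStats& live,
			     std::FILE* out)
{
	LockStats snap;
	{
		std::lock_guard<std::mutex> g(lock_mutex);
		snap = live;             // cheap copy while holding the mutex
	}
	std::fprintf(out, "%u record locks, %u lock structs, %zu heap bytes\n",
		     snap.rec_locks, snap.trx_locks, snap.heap_bytes);
}
// --------------------------------------------------------------------------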
*/ + const dtuple_t* entry) /*!< in: index entry to insert in the + child table */ +{ + std::string fk_str; + + if (srv_read_only_mode) { + return; + } + + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format(trx, foreign, + TRUE); + fputs(fk_str.c_str(), ef); + if (foreign->foreign_index) { + fprintf(ef, " in parent table, in index %s", + foreign->foreign_index->name()); + } else { + fputs(" in parent table", ef); + } + if (entry) { + fputs(" tuple:\n", ef); + /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized. + It would be better to only display the user columns. */ + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fprintf(ef, ", in index %s,\n" + "the closest match we can find is record:\n", + foreign->referenced_index->name()); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. */ + rec = page_rec_get_prev_const(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->referenced_index); + } + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/*********************************************************************//** +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + const char* name) /*!< in: table name prefixed with + database name and a '/' character */ +{ + innobase_invalidate_query_cache(thr_get_trx(thr), name); +} + +/** Fill virtual column information in cascade node for the child table. +@param[out] cascade child update node +@param[in] rec clustered rec of child table +@param[in] index clustered index of child table +@param[in] node parent update node +@param[in] foreign foreign key information +@return error code. 
*/ +static +dberr_t +row_ins_foreign_fill_virtual( + upd_node_t* cascade, + const rec_t* rec, + dict_index_t* index, + upd_node_t* node, + dict_foreign_t* foreign) +{ + THD* thd = current_thd; + row_ext_t* ext; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + const rec_offs* offsets = + rec_get_offsets(rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &cascade->heap); + TABLE* mysql_table= NULL; + upd_t* update = cascade->update; + ulint n_v_fld = index->table->n_v_def; + ulint n_diff; + upd_field_t* upd_field; + dict_vcol_set* v_cols = foreign->v_cols; + update->old_vrow = row_build( + ROW_COPY_DATA, index, rec, + offsets, index->table, NULL, NULL, + &ext, update->heap); + n_diff = update->n_fields; + + ut_ad(index->table->vc_templ != NULL); + + ib_vcol_row vc(NULL); + uchar *record = vc.record(thd, index, &mysql_table); + if (!record) { + return DB_OUT_OF_MEMORY; + } + ut_ad(!node->is_delete + || (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)); + ut_ad(foreign->type & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_CASCADE)); + + for (uint16_t i = 0; i < n_v_fld; i++) { + + dict_v_col_t* col = dict_table_get_nth_v_col( + index->table, i); + + dict_vcol_set::iterator it = v_cols->find(col); + + if (it == v_cols->end()) { + continue; + } + + dfield_t* vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, update->heap, NULL, thd, mysql_table, + record, NULL, NULL); + + if (vfield == NULL) { + return DB_COMPUTE_VALUE_FAILED; + } + + upd_field = update->fields + n_diff; + + upd_field->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc(update->heap, + sizeof *upd_field->old_v_val)); + + dfield_copy(upd_field->old_v_val, vfield); + + upd_field_set_v_field_no(upd_field, i, index); + + dfield_t* new_vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, update->heap, NULL, thd, + mysql_table, record, NULL, + update); + + if (new_vfield == NULL) { + return DB_COMPUTE_VALUE_FAILED; + } + + dfield_copy(&upd_field->new_val, new_vfield); + + if (!dfield_datas_are_binary_equal( + upd_field->old_v_val, + &upd_field->new_val, 0)) + n_diff++; + } + + update->n_fields = n_diff; + return DB_SUCCESS; +} + +#ifdef WITH_WSREP +dberr_t wsrep_append_foreign_key(trx_t *trx, + dict_foreign_t* foreign, + const rec_t* clust_rec, + dict_index_t* clust_index, + bool referenced, + upd_node_t* upd_node, + bool pa_disable, + Wsrep_service_key_type key_type); +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. 
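// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] row_ins_foreign_fill_virtual()
// above recomputes each affected virtual column twice (against the old row
// and against the cascaded update) and grows the update vector only when the
// two values are not binary-equal. The filtering idea in isolation, with
// hypothetical types:
#include <string>
#include <utility>
#include <vector>

struct VColUpdate { unsigned field_no; std::string old_val, new_val; };

static std::vector<VColUpdate> collect_changed(
	const std::vector<std::pair<std::string, std::string>>& old_new)
{
	std::vector<VColUpdate> changed;
	for (unsigned i = 0; i < old_new.size(); i++)
		if (old_new[i].first != old_new[i].second)
			changed.push_back({i, old_new[i].first,
					   old_new[i].second});
	return changed;
}
// --------------------------------------------------------------------------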
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_foreign_check_on_constraint( +/*================================*/ + que_thr_t* thr, /*!< in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /*!< in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /*!< in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /*!< in: index entry in the parent + table */ + mtr_t* mtr) /*!< in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t*const*const fktable = &foreign->foreign_table; + dict_table_t* table = *fktable; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + const rec_t* rec; + const rec_t* clust_rec; + const buf_block_t* clust_block; + upd_t* update; + dberr_t err; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + doc_id_t doc_id = FTS_NULL_DOC_ID; + + DBUG_ENTER("row_ins_foreign_check_on_constraint"); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the mutex rank above the lock_sys.latch. The query cache mutex + has a rank just above the lock_sys.latch. */ + + row_ins_invalidate_query_cache(thr, table->name.m_name); + + node = static_cast<upd_node_t*>(thr->run_node); + + if (node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + DBUG_RETURN(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + DBUG_RETURN(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + + } + cascade = node->cascade_node; + cascade->table = table; + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = PLAIN_DELETE; + } else { + cascade->is_delete = NO_DELETE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + + /* We do not allow cyclic cascaded updating (DELETE is + allowed, but not UPDATE) of the same table, as this + can lead to an infinite cycle. Check that we are not + updating the same table which is already being + modified in this cascade chain. We have to check this + also because the modification of the indexes of a + 'parent' table may still be incomplete, and we must + avoid seeing the indexes of the parent table in an + inconsistent state! 
*/ + + if (row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying an update, possibly causing a cyclic" + " cascaded update\n" + "in the child table,", thr, foreign, + btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + } + + if (row_ins_cascade_n_ancestors(cascade) >= FK_MAX_CASCADE_DEL) { + err = DB_FOREIGN_EXCEED_MAX_CASCADE; + + row_ins_foreign_report_err( + "Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = pcur->index(); + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + tmp_heap = mem_heap_create(256); + + if (dict_index_is_clust(index)) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + clust_block = btr_pcur_get_block(pcur); + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + cascade->pcur->old_rec = nullptr; + cascade->pcur->btr_cur.page_cur.index = clust_index; + err = btr_pcur_open_with_no_init(ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto nonstandard_exit_func; + } + + clust_rec = btr_pcur_get_rec(cascade->pcur); + clust_block = btr_pcur_get_block(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + ib::error() << "In cascade of a foreign key op index " + << index->name + << " of table " << index->table->name; + + fputs("InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" + "InnoDB: Submit a detailed bug report to" + " https://jira.mariadb.org/\n", stderr); + ut_ad(0); + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + err = lock_table(table, fktable, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = lock_clust_rec_read_check_and_lock_alt( + 0, clust_block, clust_rec, clust_index, + LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(rec_get_trx_id(clust_rec, clust_index)); + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if (table->fts) { + doc_id = fts_get_doc_id_from_rec( + clust_rec, clust_index, + rec_get_offsets(clust_rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap)); + } + + if (node->is_delete + ? 
(foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) + : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) { + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + MEM_UNDEFINED(update->fields, + update->n_fields * sizeof *update->fields); + + for (ulint i = 0; i < foreign->n_fields; i++) { + upd_field_t* ufield = &update->fields[i]; + ulint col_no = dict_index_get_nth_col_no( + index, i); + ulint prefix_col; + + ufield->field_no = static_cast<uint16_t>( + dict_table_get_nth_col_pos( + table, col_no, &prefix_col)); + dict_col_t* col = dict_table_get_nth_col( + table, col_no); + dict_col_copy_type(col, dfield_get_type(&ufield->new_val)); + + ufield->orig_len = 0; + ufield->exp = NULL; + dfield_set_null(&ufield->new_val); + } + + if (foreign->affects_fulltext()) { + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + + if (foreign->v_cols != NULL + && foreign->v_cols->size() > 0) { + err = row_ins_foreign_fill_virtual( + cascade, clust_rec, clust_index, + node, foreign); + + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; + } + } + } else if (table->fts && cascade->is_delete == PLAIN_DELETE + && foreign->affects_fulltext()) { + /* DICT_FOREIGN_ON_DELETE_CASCADE case */ + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + bool affects_fulltext = row_ins_cascade_calc_update_vec( + node, foreign, tmp_heap, trx); + + if (foreign->v_cols && !foreign->v_cols->empty()) { + err = row_ins_foreign_fill_virtual( + cascade, clust_rec, clust_index, + node, foreign); + + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; + } + } + + switch (cascade->update->n_fields) { + case ULINT_UNDEFINED: + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a cascaded update where the" + " updated value in the child\n" + "table would not fit in the length" + " of the column, or the value would\n" + "be NULL and the column is" + " declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + case 0: + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + /* Mark the old Doc ID as deleted */ + if (affects_fulltext) { + ut_ad(table->fts); + fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL); + } + } + + if (table->versioned() && cascade->is_delete != PLAIN_DELETE + && cascade->update->affects_versioned()) { + ut_ad(!cascade->historical_heap); + cascade->historical_heap = mem_heap_create(srv_page_size); + cascade->historical_row = row_build( + ROW_COPY_DATA, clust_index, clust_rec, NULL, table, + NULL, NULL, NULL, cascade->historical_heap); + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + btr_pcur_store_position(cascade->pcur, mtr); + } + +#ifdef WITH_WSREP + if (trx->is_wsrep()) { + err = wsrep_append_foreign_key(trx, foreign, clust_rec, clust_index, + false, NULL, true, + WSREP_SERVICE_KEY_EXCLUSIVE); + if (err != DB_SUCCESS) { + goto nonstandard_exit_func; 
+ } + } +#endif /* WITH_WSREP */ + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + mtr_start(mtr); + + /* Restore pcur position */ + + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) + != btr_pcur_t::SAME_ALL) { + err = DB_CORRUPTION; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + DBUG_RETURN(err); + +nonstandard_exit_func: + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + btr_pcur_store_position(pcur, mtr); + + mtr_commit(mtr); + mtr_start(mtr); + + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) + != btr_pcur_t::SAME_ALL && err == DB_SUCCESS) { + err = DB_CORRUPTION; + } + + DBUG_RETURN(err); +} + +/*********************************************************************//** +Sets a shared lock on a record. Used in locking possible duplicate key +records and also in checking foreign key constraints. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_shared_rec_lock( +/*========================*/ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_S, type, thr); + } + + return(err); +} + +/*********************************************************************//** +Sets a exclusive lock on a record. Used in locking possible duplicate key +records +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +static +dberr_t +row_ins_set_exclusive_rec_lock( +/*===========================*/ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, LOCK_X, type, thr); + } + + return(err); +} + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_sys.latch. 
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* upd_node; + ulint n_fields_cmp; + btr_pcur_t pcur; + int cmp; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + bool skip_gap_lock; + + skip_gap_lock = (trx->isolation_level <= TRX_ISO_READ_COMMITTED); + + DBUG_ENTER("row_ins_check_foreign_constraint"); + + rec_offs_init(offsets_); + +#ifdef WITH_WSREP + upd_node= NULL; +#endif /* WITH_WSREP */ + + if (!trx->check_foreigns) { + /* The user has suppressed foreign key checks currently for + this session */ + DBUG_RETURN(DB_SUCCESS); + } + + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + for (ulint i = 0; i < entry->n_fields; i++) { + dfield_t* field = dtuple_get_nth_field(entry, i); + if (i < foreign->n_fields && dfield_is_null(field)) { + DBUG_RETURN(DB_SUCCESS); + } + /* System Versioning: if row_end != Inf, we + suppress the foreign key check */ + if (field->type.vers_sys_end() && field->vers_history_row()) { + DBUG_RETURN(DB_SUCCESS); + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) { + upd_node = static_cast<upd_node_t*>(thr->run_node); + + if (upd_node->is_delete != PLAIN_DELETE + && upd_node->foreign == foreign) { + /* If a cascaded update is done as defined by a + foreign key constraint, do not check that + constraint for the child row. In ON UPDATE CASCADE + the update of the parent row is only half done when + we come here: if we would check the constraint here + for the child row it would fail. + + A QUESTION remains: if in the child table there are + several constraints which refer to the same parent + table, we should merge all updates to the child as + one update? And the updates can be contradictory! + Currently we just perform the update associated + with each foreign key constraint, one after + another, and the user has problems predicting in + which order they are performed. */ + + DBUG_RETURN(DB_SUCCESS); + } + } + + if (que_node_get_type(thr->run_node) == QUE_NODE_INSERT) { + ins_node_t* insert_node = + static_cast<ins_node_t*>(thr->run_node); + dict_table_t* table = insert_node->index->table; + if (table->versioned()) { + dfield_t* row_end = dtuple_get_nth_field( + insert_node->row, table->vers_end); + if (row_end->vers_history_row()) { + DBUG_RETURN(DB_SUCCESS); + } + } + } + + dict_table_t *check_table; + dict_index_t *check_index; + dberr_t err = DB_SUCCESS; + + { + dict_table_t*& fktable = check_ref + ? foreign->referenced_table : foreign->foreign_table; + check_table = fktable; + if (check_table) { + err = lock_table(check_table, &fktable, LOCK_IS, thr); + if (err != DB_SUCCESS) { + goto do_possible_lock_wait; + } + } + check_table = fktable; + } + + check_index = check_ref + ? 
foreign->referenced_index : foreign->foreign_index; + + if (!check_table || !check_table->is_readable() || !check_index) { + FILE* ef = dict_foreign_err_file; + std::string fk_str; + + row_ins_set_detailed(trx, foreign); + row_ins_foreign_trx_print(trx); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, check_ref + ? foreign->foreign_table_name + : foreign->referenced_table_name); + fputs(":\n", ef); + fk_str = dict_print_info_on_foreign_key_in_create_format( + trx, foreign, TRUE); + fputs(fk_str.c_str(), ef); + if (check_ref) { + if (foreign->foreign_index) { + fprintf(ef, "\nTrying to add to index %s" + " tuple:\n", + foreign->foreign_index->name()); + } else { + fputs("\nTrying to add tuple:\n", ef); + } + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, foreign->referenced_table_name); + fputs("\nor its .ibd file or the required index does" + " not currently exist!\n", ef); + err = DB_NO_REFERENCED_ROW; + } else { + if (foreign->referenced_index) { + fprintf(ef, "\nTrying to modify index %s" + " tuple:\n", + foreign->referenced_index->name()); + } else { + fputs("\nTrying to modify tuple:\n", ef); + } + dtuple_print(ef, entry); + fputs("\nBut the referencing table ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + fputs("\nor its .ibd file or the required index does" + " not currently exist!\n", ef); + err = DB_ROW_IS_REFERENCED; + } + + mysql_mutex_unlock(&dict_foreign_err_mutex); + goto exit_func; + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + pcur.btr_cur.page_cur.index = check_index; + err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto end_scan; + } + + /* Scan index records and check if there is a matching record */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, check_index, offsets, + check_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(rec)) { + + if (skip_gap_lock) { + + continue; + } + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block, + rec, check_index, + offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + continue; + default: + goto end_scan; + } + } + + cmp = cmp_dtuple_rec(entry, rec, check_index, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(!dict_index_is_clust(check_index) + || row_get_rec_trx_id(rec, check_index, + offsets)); + + err = row_ins_set_shared_rec_lock( + skip_gap_lock + ? LOCK_REC_NOT_GAP + : LOCK_ORDINARY, block, + rec, check_index, offsets, thr); + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + } else { + if (check_table->versioned()) { + bool history_row = false; + + if (check_index->is_primary()) { + history_row = check_index-> + vers_history_row(rec, + offsets); + } else if (check_index-> + vers_history_row(rec, + history_row)) { + break; + } + + if (history_row) { + continue; + } + } + /* Found a matching record. 
Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (check_ref) { + err = DB_SUCCESS; +#ifdef WITH_WSREP + if (trx->is_wsrep()) { + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + rec, + check_index, + check_ref, + upd_node, + false, + WSREP_SERVICE_KEY_REFERENCE); + } +#endif /* WITH_WSREP */ + goto end_scan; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + /* Since reporting a plain + "duplicate key" error + message to the user in + cases where a long CASCADE + operation would lead to a + duplicate key in some + other table is very + confusing, map duplicate + key errors resulting from + FK constraints to a + separate error code. */ + + if (err == DB_DUPLICATE_KEY) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + + goto end_scan; + } + + /* row_ins_foreign_check_on_constraint + may have repositioned pcur on a + different block */ + block = btr_pcur_get_block(&pcur); + } else { + row_ins_foreign_report_err( + "Trying to delete or update", + thr, foreign, rec, entry); + + err = DB_ROW_IS_REFERENCED; + goto end_scan; + } + } + } else { + ut_a(cmp < 0); + + err = skip_gap_lock + ? DB_SUCCESS + : row_ins_set_shared_rec_lock( + LOCK_GAP, block, + rec, check_index, offsets, thr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (check_ref) { + err = DB_NO_REFERENCED_ROW; + row_ins_foreign_report_add_err( + trx, foreign, rec, entry); + } + default: + break; + } + + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, &mtr)); + + if (check_ref) { + row_ins_foreign_report_add_err( + trx, foreign, btr_pcur_get_rec(&pcur), entry); + err = DB_NO_REFERENCED_ROW; + } else { + err = DB_SUCCESS; + } + +end_scan: + mtr_commit(&mtr); + ut_free(pcur.old_rec_buf); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + +do_possible_lock_wait: + if (err == DB_LOCK_WAIT) { + trx->error_state = err; + + thr->lock_state = QUE_THR_LOCK_ROW; + + err = lock_wait(thr); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + if (err == DB_SUCCESS) { + err = DB_LOCK_WAIT; + } + } + +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + DBUG_RETURN(err); +} + +/** Sets the values of the dtuple fields in ref_entry from the values of +foreign columns in entry. +@param[in] foreign foreign key constraint +@param[in] index clustered index +@param[in] entry tuple of clustered index +@param[in] ref_entry tuple of foreign columns +@return true if all foreign key fields present in clustered index */ +static +bool row_ins_foreign_index_entry(dict_foreign_t *foreign, + const dict_index_t *index, + const dtuple_t *entry, + dtuple_t *ref_entry) +{ + for (ulint i= 0; i < foreign->n_fields; i++) + { + for (ulint j= 0; j < index->n_fields; j++) + { + const dict_col_t *col= dict_index_get_nth_col(index, j); + + /* A clustered index may contain instantly dropped columns, + which must be skipped. 
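// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] A simplified view of the
// lock-mode choices made in the foreign key scan above: next-key (ordinary)
// locks also protect the gap before a record, and the gap component is
// skipped at READ COMMITTED and below; a matching user record itself only
// needs a not-gap record lock. (The real code additionally takes a next-key
// lock on delete-marked matches at higher isolation levels.) Hypothetical
// enums:
enum class ScanRec { SUPREMUM, MATCHING_ROW, FIRST_PAST_KEY };
enum class FkLock  { NONE, ORDINARY /* next-key */, REC_NOT_GAP, GAP };

static FkLock fk_scan_lock(ScanRec kind, bool skip_gap_lock)
{
	switch (kind) {
	case ScanRec::SUPREMUM:       // page end: protect the gap, if needed
		return skip_gap_lock ? FkLock::NONE : FkLock::ORDINARY;
	case ScanRec::MATCHING_ROW:   // a row carrying the referenced key
		return FkLock::REC_NOT_GAP;
	case ScanRec::FIRST_PAST_KEY: // no match: lock the gap before it
		return skip_gap_lock ? FkLock::NONE : FkLock::GAP;
	}
	return FkLock::NONE;
}
// --------------------------------------------------------------------------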
*/ + if (col->is_dropped()) + continue; + + const char *col_name= dict_table_get_col_name(index->table, col->ind); + if (0 == innobase_strcasecmp(col_name, foreign->foreign_col_names[i])) + { + dfield_copy(&ref_entry->fields[i], &entry->fields[j]); + goto got_match; + } + } + return false; +got_match: + continue; + } + + return true; +} + +/***************************************************************//** +Checks if foreign key constraints fail for an index entry. If index +is not mentioned in any constraint, this function does nothing, +Otherwise does searches to the indexes of referenced tables and +sets shared locks which lock either the success or the failure of +a constraint. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_check_foreign_constraints( +/*==============================*/ + dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in: index */ + bool pk, /*!< in: index->is_primary() */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_foreign_t* foreign; + dberr_t err = DB_SUCCESS; + mem_heap_t* heap = NULL; + + DBUG_ASSERT(index->is_primary() == pk); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_ins"); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + err == DB_SUCCESS && it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == index + || (pk && !foreign->foreign_index)) { + + dtuple_t* ref_tuple = entry; + if (UNIV_UNLIKELY(!foreign->foreign_index)) { + /* Change primary key entry to + foreign key index entry */ + if (!heap) { + heap = mem_heap_create(1000); + } else { + mem_heap_empty(heap); + } + + ref_tuple = dtuple_create( + heap, foreign->n_fields); + dtuple_set_n_fields_cmp( + ref_tuple, foreign->n_fields); + if (!row_ins_foreign_index_entry( + foreign, index, entry, ref_tuple)) { + err = DB_NO_REFERENCED_ROW; + break; + } + + } + + dict_table_t* ref_table = NULL; + dict_table_t* referenced_table + = foreign->referenced_table; + + if (referenced_table == NULL) { + + ref_table = dict_table_open_on_name( + foreign->referenced_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + } + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, ref_tuple, thr); + + if (ref_table) { + dict_table_close(ref_table); + } + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/***************************************************************//** +Checks if a unique key violation to rec would occur at the index entry +insert. +@return TRUE if error */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + const rec_t* rec, /*!< in: user record; NOTE that we assume + that the caller already has a record lock on + the record! 
*/ + const dtuple_t* entry, /*!< in: entry to insert */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + + cmp_dtuple_rec_with_match(entry, rec, index, offsets, &matched_fields); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!dict_index_is_clust(index) && !index->nulls_equal) { + + for (i = 0; i < n_unique; i++) { + if (dfield_is_null(dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); +} + +/** Determine whether a history row was inserted by this transaction +(row TRX_ID is the same as current TRX_ID). +@param index secondary index +@param rec secondary index record +@param trx transaction +@return error code +@retval DB_SUCCESS on success +@retval DB_FOREIGN_DUPLICATE_KEY if a history row was inserted by trx */ +static dberr_t vers_row_same_trx(dict_index_t* index, const rec_t* rec, + const trx_t& trx) +{ + mtr_t mtr; + dberr_t ret= DB_SUCCESS; + dict_index_t *clust_index= dict_table_get_first_index(index->table); + ut_ad(index != clust_index); + + mtr.start(); + + if (const rec_t *clust_rec= + row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr)) + { + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *clust_offs= offsets_; + rec_offs_init(offsets_); + mem_heap_t *heap= NULL; + + clust_offs= + rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, ULINT_UNDEFINED, &heap); + if (clust_index->vers_history_row(clust_rec, clust_offs)) + { + ulint trx_id_len; + const byte *trx_id= rec_get_nth_field(clust_rec, clust_offs, + clust_index->n_uniq, &trx_id_len); + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + + if (trx.id == trx_read_trx_id(trx_id)) + ret= DB_FOREIGN_DUPLICATE_KEY; + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } + else + { + ib::error() << "foreign constraints: secondary index " << index->name << + " of table " << index->table->name << " is out of sync"; + ut_ad("secondary index is out of sync" == 0); + ret= DB_TABLE_CORRUPT; + } + + mtr.commit(); + return ret; +} + +/***************************************************************//** +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. 
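// --------------------------------------------------------------------------
// [Editorial sketch -- not part of this commit] The check above encodes the
// SQL rule that NULL != NULL in a unique secondary index: if any of the
// n_unique key fields of the new entry is NULL, no duplicate-key error can
// be raised (unless the index uses "NULLs are equal" semantics, as
// index->nulls_equal indicates for some internal indexes). In isolation:
#include <optional>
#include <string>
#include <vector>

static bool unique_conflict_possible(
	const std::vector<std::optional<std::string>>& unique_key_fields,
	bool nulls_equal)
{
	if (!nulls_equal)
		for (const auto& f : unique_key_fields)
			if (!f.has_value())
				return false;  // a NULL key part never collides
	return true;
}
// --------------------------------------------------------------------------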
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: non-clustered unique index */ + dtuple_t* entry, /*!< in: index entry */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mem_heap_t* offsets_heap) + /*!< in/out: memory heap that can be emptied */ +{ + ulint n_unique; + int cmp; + ulint n_fields_cmp; + btr_pcur_t pcur; + rec_offs offsets_[REC_OFFS_SEC_INDEX_SIZE]; + rec_offs* offsets = offsets_; + DBUG_ENTER("row_ins_scan_sec_index_for_duplicate"); + + rec_offs_init(offsets_); + + ut_ad(!index->lock.have_any()); + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + if (!index->nulls_equal) { + for (ulint i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + DBUG_RETURN(DB_SUCCESS); + } + } + } + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, n_unique); + pcur.btr_cur.page_cur.index = index; + trx_t* const trx = thr_get_trx(thr); + dberr_t err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, + &pcur, mtr); + if (err != DB_SUCCESS) { + goto end_scan; + } + + /* Scan index records and check if there is a duplicate */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + const ulint lock_type = LOCK_ORDINARY; + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + + if (flags & BTR_NO_LOCKING_FLAG) { + /* Set no locks when applying log + in online table rebuild. */ + } else if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). 
*/ + + err = row_ins_set_exclusive_rec_lock( + lock_type, block, rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + lock_type, block, rec, index, offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + case DB_SUCCESS: + break; + default: + goto end_scan; + } + + if (page_rec_is_supremum(rec)) { + + continue; + } + + cmp = cmp_dtuple_rec(entry, rec, index, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + + err = DB_DUPLICATE_KEY; + + trx->error_info = index; + + if (!index->table->versioned()) { + } else if (dberr_t e = + vers_row_same_trx(index, rec, + *trx)) { + err = e; + goto end_scan; + } + + /* If the duplicate is on hidden FTS_DOC_ID, + state so in the error log */ + if (index == index->table->fts_doc_id_index + && DICT_TF2_FLAG_IS_SET( + index->table, + DICT_TF2_FTS_HAS_DOC_ID)) { + + ib::error() << "Duplicate FTS_DOC_ID" + " value on table " + << index->table->name; + } + + goto end_scan; + } + } else { + ut_a(cmp < 0); + goto end_scan; + } + } while (btr_pcur_move_to_next(&pcur, mtr)); + +end_scan: + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + DBUG_RETURN(err); +} + +/** Checks for a duplicate when the table is being rebuilt online. +@param n_uniq index->db_trx_id() +@param entry entry being inserted +@param rec clustered index record at insert position +@param index clustered index +@param offsets rec_get_offsets(rec) +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_online(ulint n_uniq, const dtuple_t *entry, + const rec_t *rec, const dict_index_t *index, + rec_offs *offsets) +{ + ulint fields = 0; + + /* During rebuild, there should not be any delete-marked rows + in the new table. */ + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq); + ut_ad(n_uniq == index->db_trx_id()); + + /* Compare the PRIMARY KEY fields and the DB_TRX_ID, DB_ROLL_PTR. */ + cmp_dtuple_rec_with_match_low(entry, rec, index, offsets, n_uniq + 2, + &fields); + + if (fields < n_uniq) { + /* Not a duplicate. */ + return(DB_SUCCESS); + } + + ulint trx_id_len; + + if (fields == n_uniq + 2 + && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len), + reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + /* rec is an exact match of entry, and DB_TRX_ID belongs + to a transaction that started after our ALTER TABLE. */ + return(DB_SUCCESS_LOCKED_REC); + } + + return(DB_DUPLICATE_KEY); +} + +/** Checks for a duplicate when the table is being rebuilt online. 
+@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust_online( +/*====================================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const btr_cur_t*cursor, /*!< in: cursor on insert position */ + rec_offs** offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + dberr_t err = DB_SUCCESS; + const rec_t* rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->is_instant()); + + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + cursor->index()->n_fields, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, + rec, cursor->index(), *offsets); + if (err != DB_SUCCESS) { + return(err); + } + } + + if (!(rec = page_rec_get_next_const(btr_cur_get_rec(cursor)))) { + return DB_CORRUPTION; + } + + if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + cursor->index()->n_fields, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, + rec, cursor->index(), *offsets); + } + + return(err); +} + +/***************************************************************//** +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! +@retval DB_SUCCESS if no error +@retval DB_DUPLICATE_KEY if error, +@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust( + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: B-tree cursor */ + const dtuple_t* entry, /*!< in: entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + rec_t* rec; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(cursor->index()->is_clust()); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row refererence part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. 
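+	As a hypothetical example, an upper-level node pointer for key
+	value 10 can compare equal to the entry even though the leaf page
+	it points to contains only the user records 9 and 11; this is why
+	the candidate record is re-checked with row_ins_dupl_error_with_rec()
+	below and the infimum and supremum pseudo-records are skipped.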
*/ + + n_unique = dict_index_get_n_unique(cursor->index()); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index(), + offsets, + cursor->index() + ->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (flags & BTR_NO_LOCKING_FLAG) { + /* Do nothing if no-locking is set */ + err = DB_SUCCESS; + } else if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), rec, + cursor->index(), offsets, thr); + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index(), offsets)) { +duplicate: + trx->error_info = cursor->index(); + err = DB_DUPLICATE_KEY; + if (thr->prebuilt + && thr->prebuilt->upd_node + && thr->prebuilt->upd_node->is_delete + == VERSIONED_DELETE + && entry->vers_history_row()) + { + ulint trx_id_len; + byte *trx_id = rec_get_nth_field( + rec, offsets, n_unique, + &trx_id_len); + ut_ad(trx_id_len == DATA_TRX_ID_LEN); + if (trx->id == trx_read_trx_id(trx_id)) { + err = DB_FOREIGN_DUPLICATE_KEY; + } + } + goto func_exit; + } + } + } + + err = DB_SUCCESS; + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + if (rec && !page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index(), + offsets, + cursor->index() + ->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (trx->duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index(), offsets, thr); + } + + switch (err) { + default: + break; + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index(), + offsets)) { + goto duplicate; + } + } + } + + /* This should never happen */ + err = DB_CORRUPTION; + } +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/***************************************************************//** +Checks if an index entry has long enough common prefix with an +existing record so that the intended insert of the entry must be +changed to a modify of the existing record. In the case of a clustered +index, the prefix must be n_unique fields long. In the case of a +secondary index, all fields must be equal. InnoDB never updates +secondary index records in place, other than clearing or setting the +delete-mark flag. 
We could be able to update the non-unique fields +of a unique secondary index record by checking the cursor->up_match, +but we do not do so, because it could have some locking implications. +@return TRUE if the existing record should be updated; FALSE if not */ +UNIV_INLINE +ibool +row_ins_must_modify_rec( +/*====================*/ + const btr_cur_t* cursor) /*!< in: B-tree cursor */ +{ + /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust) + Because node pointers on upper levels of the B-tree may match more + to entry than to actual user records on the leaf level, we + have to check if the candidate record is actually a user record. + A clustered index node pointer contains index->n_unique first fields, + and a secondary index node pointer contains all index fields. */ + + return(cursor->low_match + >= dict_index_get_n_unique_in_tree(cursor->index()) + && !page_rec_is_infimum(btr_cur_get_rec(cursor))); +} + +/** Insert the externally stored fields (off-page columns) +of a clustered index entry. +@param[in] entry index entry to insert +@param[in] big_rec externally stored fields +@param[in,out] offsets rec_get_offsets() +@param[in,out] heap memory heap +@param[in] thd client connection, or NULL +@param[in] index clustered index +@return error code +@retval DB_SUCCESS +@retval DB_OUT_OF_FILE_SPACE */ +static +dberr_t +row_ins_index_entry_big_rec( + const dtuple_t* entry, + const big_rec_t* big_rec, + rec_offs* offsets, + mem_heap_t** heap, + dict_index_t* index, + const void* thd __attribute__((unused))) +{ + mtr_t mtr; + btr_pcur_t pcur; + rec_t* rec; + + pcur.btr_cur.page_cur.index = index; + ut_ad(index->is_primary()); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch"); + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + dberr_t error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_TREE, + &pcur, &mtr); + if (error != DB_SUCCESS) { + return error; + } + + rec = btr_pcur_get_rec(&pcur); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); + error = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, BTR_STORE_INSERT); + DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern"); + + mtr.commit(); + + ut_free(pcur.old_rec_buf); + return(error); +} + +#ifdef HAVE_REPLICATION /* Working around MDEV-24622 */ +extern "C" int thd_is_slave(const MYSQL_THD thd); +#else +# define thd_is_slave(thd) 0 +#endif + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock(). +We would only need this for row_ins_clust_index_entry_low(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif + +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + big_rec_t* big_rec = NULL; + mtr_t mtr; + uint64_t auto_inc = 0; + mem_heap_t* offsets_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + trx_t* trx = thr_get_trx(thr); + buf_block_t* block; + + DBUG_ENTER("row_ins_clust_index_entry_low"); + + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_unique(index) + || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!trx->in_rollback); + + mtr.start(); + + if (index->table->is_temporary()) { + /* Disable REDO logging as the lifetime of temp-tables is + limited to server or connection lifetime and so REDO + information is not needed on restart for recovery. + Disable locking as temp-tables are local to a connection. */ + + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(!index->table->persistent_autoinc); + ut_ad(!index->is_instant()); + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + + if (UNIV_UNLIKELY(entry->is_metadata())) { + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(mode == BTR_MODIFY_TREE); + } else { + if (mode == BTR_MODIFY_LEAF + && dict_index_is_online_ddl(index)) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } + + if (unsigned ai = index->table->persistent_autoinc) { + /* Prepare to persist the AUTO_INCREMENT value + from the index entry to PAGE_ROOT_AUTO_INC. 
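+			The value is parsed with row_parse_int() below;
+			if it is nonzero and the descent is not already
+			BTR_MODIFY_TREE, the latch mode is upgraded so
+			that the root page is latched as well, and the
+			value is written to the root page via
+			page_set_autoinc() after the search.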
*/ + const dfield_t* dfield = dtuple_get_nth_field( + entry, ai - 1); + if (!dfield_is_null(dfield)) { + auto_inc = row_parse_int( + static_cast<const byte*>( + dfield->data), + dfield->len, + dfield->type.mtype, + dfield->type.prtype + & DATA_UNSIGNED); + if (auto_inc + && mode != BTR_MODIFY_TREE) { + mode = btr_latch_mode( + BTR_MODIFY_ROOT_AND_LEAF + ^ BTR_MODIFY_LEAF + ^ mode); + } + } + } + } + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + pcur.btr_cur.page_cur.index = index; + err = btr_pcur_open(entry, PAGE_CUR_LE, mode, &pcur, &mtr); + if (err != DB_SUCCESS) { + index->table->file_unreadable = true; +err_exit: + mtr.commit(); + goto func_exit; + } + + if (auto_inc) { + buf_block_t* root + = mtr.at_savepoint(mode != BTR_MODIFY_ROOT_AND_LEAF); + ut_ad(index->page == root->page.id().page_no()); + page_set_autoinc(root, auto_inc, &mtr, false); + } + + btr_pcur_get_btr_cur(&pcur)->thr = thr; + +#ifdef UNIV_DEBUG + { + page_t* page = btr_pcur_get_page(&pcur); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_n_fields_is_sane(index, first_rec, entry)); + } +#endif /* UNIV_DEBUG */ + + block = btr_pcur_get_block(&pcur); + + DBUG_EXECUTE_IF("row_ins_row_level", goto skip_bulk_insert;); + + if (!(flags & BTR_NO_UNDO_LOG_FLAG) + && page_is_empty(block->page.frame) + && !entry->is_metadata() && !trx->duplicates + && !trx->check_unique_secondary && !trx->check_foreigns + && !trx->dict_operation + && block->page.id().page_no() == index->page + && !index->table->skip_alter_undo + && !index->table->n_rec_locks + && !index->table->is_active_ddl() + && !index->table->has_spatial_index() + && !index->table->versioned() + && !thd_is_slave(trx->mysql_thd) /* FIXME: MDEV-24622 */) { + DEBUG_SYNC_C("empty_root_page_insert"); + + trx->bulk_insert = true; + + if (!index->table->is_temporary()) { + err = lock_table(index->table, NULL, LOCK_X, thr); + + if (err != DB_SUCCESS) { + trx->error_state = err; + trx->bulk_insert = false; + goto err_exit; + } + + if (index->table->n_rec_locks) { +avoid_bulk: + trx->bulk_insert = false; + goto skip_bulk_insert; + } + +#ifdef WITH_WSREP + if (trx->is_wsrep()) + { + if (!wsrep_thd_is_local_transaction(trx->mysql_thd)) + goto skip_bulk_insert; + if (wsrep_append_table_key(trx->mysql_thd, *index->table)) + { + trx->error_state = DB_ROLLBACK; + goto err_exit; + } + } +#endif /* WITH_WSREP */ + +#ifdef BTR_CUR_HASH_ADAPT + if (btr_search_enabled) { + btr_search_x_lock_all(); + index->table->bulk_trx_id = trx->id; + btr_search_x_unlock_all(); + } else { + index->table->bulk_trx_id = trx->id; + } +#else /* BTR_CUR_HASH_ADAPT */ + index->table->bulk_trx_id = trx->id; +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Write TRX_UNDO_EMPTY undo log and + start buffering the insert operation */ + err = trx_undo_report_row_operation( + thr, index, entry, + nullptr, 0, nullptr, nullptr, + nullptr); + + if (err != DB_SUCCESS) { + goto avoid_bulk; + } + + goto err_exit; + } + } + +skip_bulk_insert: + if (UNIV_UNLIKELY(entry->info_bits != 0)) { + ut_ad(entry->is_metadata()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + ut_ad(!dict_index_is_online_ddl(index)); + + const rec_t* rec = btr_pcur_get_rec(&pcur); + + if (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG) { + trx->error_info = index; + err = DB_DUPLICATE_KEY; + goto err_exit; + } + + 
ut_ad(!row_ins_must_modify_rec(&pcur.btr_cur)); + goto do_insert; + } + + if (rec_is_metadata(btr_pcur_get_rec(&pcur), *index)) { + goto do_insert; + } + + if (n_uniq + && (pcur.btr_cur.up_match >= n_uniq + || pcur.btr_cur.low_match >= n_uniq)) { + + if (flags + == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) { + /* Set no locks when applying log + in online table rebuild. Only check for duplicates. */ + err = row_ins_duplicate_error_in_clust_online( + n_uniq, entry, &pcur.btr_cur, + &offsets, &offsets_heap); + + switch (err) { + case DB_SUCCESS: + break; + default: + ut_ad(0); + /* fall through */ + case DB_SUCCESS_LOCKED_REC: + case DB_DUPLICATE_KEY: + trx->error_info = index; + } + } else { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + flags, &pcur.btr_cur, entry, thr); + } + + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* Note: Allowing duplicates would qualify for modification of + an existing record as the new entry is exactly same as old entry. */ + if (row_ins_must_modify_rec(&pcur.btr_cur)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + mem_heap_t* entry_heap = mem_heap_create(1024); + + err = row_ins_clust_index_entry_by_modify( + &pcur, flags, mode, &offsets, &offsets_heap, + entry_heap, entry, thr, &mtr); + + mtr_commit(&mtr); + mem_heap_free(entry_heap); + } else { + if (index->is_instant()) entry->trim(*index); +do_insert: + rec_t* insert_rec; + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED + || mode == BTR_MODIFY_ROOT_AND_LEAF + || mode + == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED); + err = btr_cur_optimistic_insert( + flags, &pcur.btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto err_exit; + } + + err = btr_cur_optimistic_insert( + flags, &pcur.btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &pcur.btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + + mtr.commit(); + + if (big_rec) { + /* Online table rebuild could read (and + ignore) the incomplete record at this point. + If online rebuild is in progress, the + row_ins_index_entry_big_rec() will write log. */ + + DBUG_EXECUTE_IF( + "row_ins_extern_checkpoint", + log_write_up_to(mtr.commit_lsn(), true);); + err = row_ins_index_entry_big_rec( + entry, big_rec, offsets, &offsets_heap, index, + trx->mysql_thd); + dtuple_convert_back_big_rec(index, entry, big_rec); + } + } + +func_exit: + if (offsets_heap != NULL) { + mem_heap_free(offsets_heap); + } + + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); +} + +/** Start a mini-transaction. +@param[in,out] mtr mini-transaction +@param[in,out] index secondary index */ +static void row_ins_sec_mtr_start(mtr_t *mtr, dict_index_t *index) +{ + ut_ad(!dict_index_is_clust(index)); + ut_ad(mtr->is_named_space(index->table->space)); + + const mtr_log_t log_mode = mtr->get_log_mode(); + + mtr->start(); + index->set_modified(*mtr); + mtr->set_log_mode(log_mode); +} + +/***************************************************************//** +Tries to insert an entry into a secondary index. 
If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_INSERT_TREE is needed +@return error code */ +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ +{ + DBUG_ENTER("row_ins_sec_index_entry_low"); + + btr_cur_t cursor; + btr_latch_mode search_mode = mode; + dberr_t err; + ulint n_unique; + mtr_t mtr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_INSERT_TREE); + + cursor.thr = thr; + cursor.rtr_info = NULL; + cursor.page_cur.index = index; + ut_ad(thr_get_trx(thr)->id != 0); + + mtr.start(); + + if (index->table->is_temporary()) { + /* Disable locking, because temporary tables are never + shared between transactions or connections. */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (index->is_spatial()) { + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + rtr_info_update_btr(&cursor, &rtr_info); + + err = rtr_insert_leaf(&cursor, entry, search_mode, &mtr); + + if (err == DB_SUCCESS && search_mode == BTR_MODIFY_LEAF + && rtr_info.mbr_adj) { + mtr_commit(&mtr); + search_mode = mode = BTR_MODIFY_TREE; + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, &cursor, + index, false); + rtr_info_update_btr(&cursor, &rtr_info); + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + err = rtr_insert_leaf(&cursor, entry, + search_mode, &mtr); + } + + DBUG_EXECUTE_IF( + "rtree_test_check_count", { + goto func_exit;}); + + } else { + if (!index->table->is_temporary()) { + search_mode = btr_latch_mode( + search_mode + | (thr_get_trx(thr)->check_unique_secondary + ? 
BTR_INSERT + : BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)); + } + + err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode, + &mtr); + } + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + btr_decryption_failed(*index); + } + goto func_exit; + } + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + ut_ad(!dict_index_is_spatial(index)); + /* The insert was buffered during the search: we are done */ + goto func_exit; + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_n_fields_is_sane(index, first_rec, entry)); + } +#endif /* UNIV_DEBUG */ + + n_unique = dict_index_get_n_unique(index); + + if (dict_index_is_unique(index) + && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) { + mtr_commit(&mtr); + + DEBUG_SYNC_C("row_ins_sec_index_unique"); + + row_ins_sec_mtr_start(&mtr, index); + + err = row_ins_scan_sec_index_for_duplicate( + flags, index, entry, thr, &mtr, offsets_heap); + + mtr_commit(&mtr); + + switch (err) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (!index->is_committed()) { + ut_ad(!thr_get_trx(thr) + ->dict_operation_lock_mode); + index->type |= DICT_CORRUPT; + /* Do not return any error to the + caller. The duplicate will be reported + by ALTER TABLE or CREATE UNIQUE INDEX. + Unfortunately we cannot report the + duplicate key value to the DDL thread, + because the altered_table object is + private to its call stack. */ + err = DB_SUCCESS; + } + /* fall through */ + default: + if (dict_index_is_spatial(index)) { + rtr_clean_rtr_info(&rtr_info, true); + } + DBUG_RETURN(err); + } + + row_ins_sec_mtr_start(&mtr, index); + + DEBUG_SYNC_C("row_ins_sec_index_entry_dup_locks_created"); + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion (bypassing the change buffer). 
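+		The BTR_INSERT and BTR_IGNORE_SEC_UNIQUE hints are masked
+		out of search_mode below, so this time the record is
+		located on the B-tree leaf page itself rather than being
+		buffered in the change buffer.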
*/ + err = cursor.search_leaf( + entry, PAGE_CUR_LE, + btr_latch_mode(search_mode + & ~(BTR_INSERT + | BTR_IGNORE_SEC_UNIQUE)), + &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + offsets = rec_get_offsets( + btr_cur_get_rec(&cursor), index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + + err = row_ins_sec_index_entry_by_modify( + flags, mode, &cursor, &offsets, + offsets_heap, heap, entry, thr, &mtr); + + if (err == DB_SUCCESS && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } else { + rec_t* insert_rec; + big_rec_t* big_rec; + + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + if (err == DB_SUCCESS + && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } else { + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); + } + if (err == DB_SUCCESS + && dict_index_is_spatial(index) + && rtr_info.mbr_adj) { + err = rtr_ins_enlarge_mbr(&cursor, &mtr); + } + } + + if (err == DB_SUCCESS && trx_id) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + ut_ad(!big_rec); + } + +func_exit: + if (dict_index_is_spatial(index)) { + rtr_clean_rtr_info(&rtr_info, true); + } + + mtr_commit(&mtr); + DBUG_RETURN(err); +} + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + dberr_t err; + ulint n_uniq; + + DBUG_ENTER("row_ins_clust_index_entry"); + + if (!index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints( + index->table, index, true, entry, thr); + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; + +#ifdef WITH_WSREP + const bool skip_locking + = wsrep_thd_skip_locking(thr_get_trx(thr)->mysql_thd); + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : (index->table->is_temporary() || skip_locking) + ? 
BTR_NO_LOCKING_FLAG : 0; +#ifdef UNIV_DEBUG + if (skip_locking && strcmp(wsrep_get_sr_table_name(), + index->table->name.m_name)) { + WSREP_ERROR("Record locking is disabled in this thread, " + "but the table being modified is not " + "`%s`: `%s`.", wsrep_get_sr_table_name(), + index->table->name.m_name); + ut_error; + } +#endif /* UNIV_DEBUG */ +#else + ulint flags = index->table->no_rollback() ? BTR_NO_ROLLBACK + : index->table->is_temporary() + ? BTR_NO_LOCKING_FLAG : 0; +#endif /* WITH_WSREP */ + const ulint orig_n_fields = entry->n_fields; + + /* For intermediate table during copy alter table, + skip the undo log and record lock checking for + insertion operation. + */ + if (index->table->skip_alter_undo) { + flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG; + } + + /* Try first optimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_LEAF, index, n_uniq, entry, + n_ext, thr); + + entry->n_fields = orig_n_fields; + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_clust_index_entry_leaf"); + + if (err != DB_FAIL) { + DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); + DBUG_RETURN(err); + } + + /* Try then pessimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, n_uniq, entry, + n_ext, thr); + + entry->n_fields = orig_n_fields; + + DBUG_RETURN(err); +} + +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + bool check_foreign) /*!< in: true if check + foreign table is needed, false otherwise */ +{ + dberr_t err = DB_SUCCESS; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + trx_id_t trx_id = 0; + + DBUG_EXECUTE_IF("row_ins_sec_index_entry_timeout", { + DBUG_SET("-d,row_ins_sec_index_entry_timeout"); + return(DB_LOCK_WAIT);}); + + if (check_foreign && !index->table->foreign_set.empty()) { + err = row_ins_check_foreign_constraints(index->table, index, + false, entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + ut_ad(thr_get_trx(thr)->id != 0); + + offsets_heap = mem_heap_create(1024); + heap = mem_heap_create(1024); + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + ulint flags = index->table->is_temporary() + ? BTR_NO_LOCKING_FLAG + : 0; + + /* For intermediate table during copy alter table, + skip the undo log and record lock checking for + insertion operation. 
+ */ + if (index->table->skip_alter_undo) { + trx_id = thr_get_trx(thr)->id; + flags |= BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG; + } + + err = row_ins_sec_index_entry_low( + flags, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, + trx_id, thr); + if (err == DB_FAIL) { + mem_heap_empty(heap); + + if (index->table->space == fil_system.sys_space + && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { + ibuf_free_excess_pages(); + } + + /* Try then pessimistic descent to the B-tree */ + log_free_check(); + + err = row_ins_sec_index_entry_low( + flags, BTR_INSERT_TREE, index, + offsets_heap, heap, entry, 0, thr); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + return(err); +} + +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +static +dberr_t +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx = thr_get_trx(thr); + + ut_ad(trx->id || index->table->no_rollback() + || index->table->is_temporary()); + + DBUG_EXECUTE_IF("row_ins_index_entry_timeout", { + DBUG_SET("-d,row_ins_index_entry_timeout"); + return(DB_LOCK_WAIT);}); + + if (index->is_btree()) { + if (auto t= trx->check_bulk_buffer(index->table)) { + /* MDEV-25036 FIXME: check also foreign key + constraints */ + ut_ad(!trx->check_foreigns); + return t->bulk_insert_buffered(*entry, *index, trx); + } + } + + if (index->is_primary()) { + return row_ins_clust_index_entry(index, entry, thr, 0); + } else { + return row_ins_sec_index_entry(index, entry, thr); + } +} + + +/*****************************************************************//** +This function generate MBR (Minimum Bounding Box) for spatial objects +and set it to spatial index field. */ +static +void +row_ins_spatial_index_entry_set_mbr_field( +/*======================================*/ + dfield_t* field, /*!< in/out: mbr field */ + const dfield_t* row_field) /*!< in: row field */ +{ + ulint dlen = 0; + double mbr[SPDIMS * 2]; + + /* This must be a GEOMETRY datatype */ + ut_ad(DATA_GEOMETRY_MTYPE(field->type.mtype)); + + const byte* dptr = static_cast<const byte*>( + dfield_get_data(row_field)); + dlen = dfield_get_len(row_field); + + /* obtain the MBR */ + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr); + + /* Set mbr as index entry data */ + dfield_write_mbr(field, mbr); +} + +/** Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. 
+@param[in] index index handler +@param[out] entry index entry to make +@param[in] row row +@return DB_SUCCESS if the set is successful */ +static +dberr_t +row_ins_index_entry_set_vals( + const dict_index_t* index, + dtuple_t* entry, + const dtuple_t* row) +{ + ulint n_fields; + ulint i; + ulint num_v = dtuple_get_n_v_fields(entry); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields + num_v; i++) { + dict_field_t* ind_field = NULL; + dfield_t* field; + const dfield_t* row_field; + ulint len; + dict_col_t* col; + + if (i >= n_fields) { + /* This is virtual field */ + field = dtuple_get_nth_v_field(entry, i - n_fields); + col = &dict_table_get_nth_v_col( + index->table, i - n_fields)->m_col; + } else { + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + col = ind_field->col; + } + + if (col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + ut_ad(dtuple_get_n_fields(row) + == dict_table_get_n_cols(index->table)); + row_field = dtuple_get_nth_v_field(row, v_col->v_pos); + } else if (col->is_dropped()) { + ut_ad(index->is_primary()); + + if (!(col->prtype & DATA_NOT_NULL)) { + field->data = NULL; + field->len = UNIV_SQL_NULL; + field->type.prtype = DATA_BINARY_TYPE; + } else { + ut_ad(ind_field->fixed_len <= col->len); + dfield_set_data(field, field_ref_zero, + ind_field->fixed_len); + field->type.prtype = DATA_NOT_NULL; + } + + field->type.mtype = col->len + ? DATA_FIXBINARY : DATA_BINARY; + continue; + } else { + row_field = dtuple_get_nth_field( + row, ind_field->col->ind); + } + + len = dfield_get_len(row_field); + + /* Check column prefix indexes */ + if (ind_field != NULL && ind_field->prefix_len > 0 + && len != UNIV_SQL_NULL) { + + const dict_col_t* col + = dict_field_get_col(ind_field); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, + len, + static_cast<const char*>( + dfield_get_data(row_field))); + + ut_ad(!dfield_is_ext(row_field)); + } + + /* Handle spatial index. For the first field, replace + the data with its MBR (Minimum Bounding Box). */ + if ((i == 0) && dict_index_is_spatial(index)) { + if (!row_field->data + || row_field->len < GEO_DATA_HEADER_SIZE) { + return(DB_CANT_CREATE_GEOMETRY_OBJECT); + } + row_ins_spatial_index_entry_set_mbr_field( + field, row_field); + continue; + } + + dfield_set_data(field, dfield_get_data(row_field), len); + if (dfield_is_ext(row_field)) { + ut_ad(dict_index_is_clust(index)); + dfield_set_ext(field); + } + } + + return(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a single index entry to the table. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins_index_entry_step( +/*=====================*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + + DBUG_ENTER("row_ins_index_entry_step"); + + ut_ad(dtuple_check_typed(node->row)); + + err = row_ins_index_entry_set_vals(node->index, *node->entry, + node->row); + + if (err != DB_SUCCESS) { + DBUG_RETURN(err); + } + + ut_ad(dtuple_check_typed(*node->entry)); + + err = row_ins_index_entry(node->index, *node->entry, thr); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_index_entry_step"); + + DBUG_RETURN(err); +} + +/***********************************************************//** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + if (dict_table_get_first_index(node->table)->is_gen_clust()) + dict_sys_write_row_id(node->sys_buf, dict_sys.get_new_row_id()); +} + +/***********************************************************//** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Gets a row to insert from the select list. */ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /*!< in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/***********************************************************//** +Inserts a row to a table. 
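+Each committed, non-corrupted, non-fulltext index of the table is
+processed in turn by row_ins_index_entry_step(), which builds the
+index entry from node->row and inserts it.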
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_ins( +/*====*/ + ins_node_t* node, /*!< in: row insert node */ + que_thr_t* thr) /*!< in: query thread */ +{ + DBUG_ENTER("row_ins"); + + DBUG_PRINT("row_ins", ("table: %s", node->table->name.m_name)); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + ut_ad(node->entry_list.empty() == false); + node->entry = node->entry_list.begin(); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (dict_index_t *index = node->index) { + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + } else if (dberr_t err = row_ins_index_entry_step(node, thr)) { + DBUG_RETURN(err); + } + node->index = dict_table_get_next_index(index); + ++node->entry; + } + + ut_ad(node->entry == node->entry_list.end()); + + node->state = INS_NODE_ALLOC_ROW_ID; + + DBUG_RETURN(DB_SUCCESS); +} + +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + dberr_t err; + + ut_ad(thr); + + DEBUG_SYNC_C("innodb_row_ins_step_enter"); + + trx = thr_get_trx(thr); + + node = static_cast<ins_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. MySQL's + partitioned table code may also call an insert within the same + SQL statement AFTER it has used this table handle to do a search. + This happens, for example, when a row update moves it to another + partition. In that case, we have already set the IX lock on the + table during the search operation, and there is no need to set + it again here. But we must write trx->id to node->sys_buf. */ + + if (node->table->no_rollback()) { + /* No-rollback tables should only be written to by a + single thread at a time, but there can be multiple + concurrent readers. We must hold an open table handle. */ + DBUG_ASSERT(node->table->get_ref_count() > 0); + DBUG_ASSERT(node->ins_type == INS_DIRECT); + /* No-rollback tables can consist only of a single index. */ + DBUG_ASSERT(node->entry_list.size() == 1); + DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1); + /* There should be no possibility for interruption and + restarting here. In theory, we could allow resumption + from the INS_NODE_INSERT_ENTRIES state here. 
*/ + DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK); + node->index = dict_table_get_first_index(node->table); + node->entry = node->entry_list.begin(); + node->state = INS_NODE_INSERT_ENTRIES; + goto do_insert; + } + + if (node->state == INS_NODE_SET_IX_LOCK) { + + node->state = INS_NODE_ALLOC_ROW_ID; + + if (node->table->is_temporary()) { + node->trx_id = trx->id; + } + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + if (trx->id == node->trx_id) { + /* No need to do IX-locking */ + + goto same_trx; + } + + err = lock_table(node->table, NULL, LOCK_IX, thr); + + DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait", + err = DB_LOCK_WAIT;); + + if (err != DB_SUCCESS) { + node->state = INS_NODE_SET_IX_LOCK; + goto error_handling; + } + + node->trx_id = trx->id; +same_trx: + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } +do_insert: + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* err == DB_LOCK_WAIT or SQL error detected */ + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc new file mode 100644 index 00000000..c4f46304 --- /dev/null +++ b/storage/innobase/row/row0log.cc @@ -0,0 +1,4134 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0log.cc +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#include "row0log.h" +#include "row0row.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0merge.h" +#include "row0ext.h" +#include "log0crypt.h" +#include "data0data.h" +#include "que0que.h" +#include "srv0mon.h" +#include "handler0alter.h" +#include "ut0stage.h" +#include "trx0rec.h" + +#include <sql_class.h> +#include <algorithm> +#include <map> + +Atomic_counter<ulint> onlineddl_rowlog_rows; +ulint onlineddl_rowlog_pct_used; +ulint onlineddl_pct_progress; + +/** Table row modification operations during online table rebuild. +Delete-marked records are not copied to the rebuilt table. */ +enum row_tab_op { + /** Insert a record */ + ROW_T_INSERT = 0x41, + /** Update a record in place */ + ROW_T_UPDATE, + /** Delete (purge) a record */ + ROW_T_DELETE +}; + +/** Index record modification operations during online index creation */ +enum row_op { + /** Insert a record */ + ROW_OP_INSERT = 0x61, + /** Delete a record */ + ROW_OP_DELETE +}; + +/** Size of the modification log entry header, in bytes */ +#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/ + +/** Log block for modifications during online ALTER TABLE */ +struct row_log_buf_t { + byte* block; /*!< file block buffer */ + size_t size; /*!< length of block in bytes */ + ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set + by ut_allocator::allocate_large() and fed to + ut_allocator::deallocate_large(). */ + mrec_buf_t buf; /*!< buffer for accessing a record + that spans two blocks */ + ulint blocks; /*!< current position in blocks */ + ulint bytes; /*!< current position within block */ + ulonglong total; /*!< logical position, in bytes from + the start of the row_log_table log; + 0 for row_log_online_op() and + row_log_apply(). */ +}; + +/** @brief Buffer for logging modifications during online index creation + +All modifications to an index that is being created will be logged by +row_log_online_op() to this buffer. + +All modifications to a table that is being rebuilt will be logged by +row_log_table_delete(), row_log_table_update(), row_log_table_insert() +to this buffer. + +When head.blocks == tail.blocks, the reader will access tail.block +directly. When also head.bytes == tail.bytes, both counts will be +reset to 0 and the file will be truncated. */ +struct row_log_t { + pfs_os_file_t fd; /*!< file descriptor */ + mysql_mutex_t mutex; /*!< mutex protecting error, + max_trx and tail */ + dict_table_t* table; /*!< table that is being rebuilt, + or NULL when this is a secondary + index that is being created online */ + bool same_pk;/*!< whether the definition of the PRIMARY KEY + has remained the same */ + const dtuple_t* defaults; + /*!< default values of added, changed columns, + or NULL */ + const ulint* col_map;/*!< mapping of old column numbers to + new ones, or NULL if !table */ + dberr_t error; /*!< error that occurred during online + table rebuild */ + /** The transaction ID of the ALTER TABLE transaction. 
Any + concurrent DML would necessarily be logged with a larger + transaction ID, because ha_innobase::prepare_inplace_alter_table() + acts as a barrier that ensures that any concurrent transaction + that operates on the table would have been started after + ha_innobase::prepare_inplace_alter_table() returns and before + ha_innobase::commit_inplace_alter_table(commit=true) is invoked. + + Due to the nondeterministic nature of purge and due to the + possibility of upgrading from an earlier version of MariaDB + or MySQL, it is possible that row_log_table_low() would be + fed DB_TRX_ID that precedes than min_trx. We must normalize + such references to reset_trx_id[]. */ + trx_id_t min_trx; + trx_id_t max_trx;/*!< biggest observed trx_id in + row_log_online_op(); + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t tail; /*!< writer context; + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + size_t crypt_tail_size; /*!< size of crypt_tail_size*/ + byte* crypt_tail; /*!< writer context; + temporary buffer used in encryption, + decryption or NULL*/ + row_log_buf_t head; /*!< reader context; protected by MDL only; + modifiable by row_log_apply_ops() */ + size_t crypt_head_size; /*!< size of crypt_tail_size*/ + byte* crypt_head; /*!< reader context; + temporary buffer used in encryption, + decryption or NULL */ + const char* path; /*!< where to create temporary file during + log operation */ + /** the number of core fields in the clustered index of the + source table; before row_log_table_apply() completes, the + table could be emptied, so that table->is_instant() no longer holds, + but all log records must be in the "instant" format. */ + unsigned n_core_fields; + /** the default values of non-core fields when the operation started */ + dict_col_t::def_t* non_core_fields; + bool allow_not_null; /*!< Whether the alter ignore is being + used or if the sql mode is non-strict mode; + if not, NULL values will not be converted to + defaults */ + const TABLE* old_table; /*< Use old table in case of error. */ + + uint64_t n_rows; /*< Number of rows read from the table */ + + /** Alter table transaction. It can be used to apply the DML logs + into the table */ + const trx_t* alter_trx; + + /** Determine whether the log should be in the 'instant ADD' format + @param[in] index the clustered index of the source table + @return whether to use the 'instant ADD COLUMN' format */ + bool is_instant(const dict_index_t* index) const + { + ut_ad(table); + ut_ad(n_core_fields <= index->n_fields); + return n_core_fields != index->n_fields; + } + + const byte* instant_field_value(ulint n, ulint* len) const + { + ut_ad(n >= n_core_fields); + const dict_col_t::def_t& d= non_core_fields[n - n_core_fields]; + *len = d.len; + return static_cast<const byte*>(d.data); + } +}; + +/** Create the file or online log if it does not exist. +@param[in,out] log online rebuild log +@return true if success, false if not */ +static MY_ATTRIBUTE((warn_unused_result)) +pfs_os_file_t +row_log_tmpfile( + row_log_t* log) +{ + DBUG_ENTER("row_log_tmpfile"); + if (log->fd == OS_FILE_CLOSED) { + log->fd = row_merge_file_create_low(log->path); + DBUG_EXECUTE_IF("row_log_tmpfile_fail", + if (log->fd != OS_FILE_CLOSED) + row_merge_file_destroy_low(log->fd); + log->fd = OS_FILE_CLOSED;); + if (log->fd != OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_LOG_FILES); + } + } + + DBUG_RETURN(log->fd); +} + +/** Allocate the memory for the log buffer. 
+@param[in,out] log_buf Buffer used for log operation +@return TRUE if success, false if not */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_log_block_allocate( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_allocate"); + if (log_buf.block == NULL) { + DBUG_EXECUTE_IF( + "simulate_row_log_allocation_failure", + DBUG_RETURN(false); + ); + + log_buf.block = ut_allocator<byte>(mem_key_row_log_buf) + .allocate_large(srv_sort_buf_size, + &log_buf.block_pfx); + + if (log_buf.block == NULL) { + DBUG_RETURN(false); + } + log_buf.size = srv_sort_buf_size; + } + DBUG_RETURN(true); +} + +/** Free the log buffer. +@param[in,out] log_buf Buffer used for log operation */ +static +void +row_log_block_free( + row_log_buf_t& log_buf) +{ + DBUG_ENTER("row_log_block_free"); + if (log_buf.block != NULL) { + ut_allocator<byte>(mem_key_row_log_buf).deallocate_large( + log_buf.block, &log_buf.block_pfx); + log_buf.block = NULL; + } + DBUG_VOID_RETURN; +} + +/** Logs an operation to a secondary index that is (or was) being created. +@param index index, S or X latched +@param tuple index tuple +@param trx_id transaction ID for insert, or 0 for delete +@retval false if row_log_apply() failure happens +or true otherwise */ +bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple, + trx_id_t trx_id) +{ + byte* b; + ulint extra_size; + ulint size; + ulint mrec_size; + ulint avail_size; + row_log_t* log; + bool success= true; + + ut_ad(dtuple_validate(tuple)); + ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index)); + ut_ad(index->lock.have_x() || index->lock.have_s()); + + if (index->is_corrupted()) { + return success; + } + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + + /* Compute the size of the record. This differs from + row_merge_buf_encode(), because here we do not encode + extra_size+1 (and reserve 0 as the end-of-chunk marker). */ + + size = rec_get_converted_size_temp<false>( + index, tuple->fields, tuple->n_fields, &extra_size); + ut_ad(size >= extra_size); + ut_ad(size <= sizeof log->tail.buf); + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + size + + (trx_id ? 
DATA_TRX_ID_LEN : 0); + + log = index->online_log; + mysql_mutex_lock(&log->mutex); + +start_log: + if (trx_id > log->max_trx) { + log->max_trx = trx_id; + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); + + ut_ad(log->tail.bytes < srv_sort_buf_size); + avail_size = srv_sort_buf_size - log->tail.bytes; + + if (mrec_size > avail_size) { + b = log->tail.buf; + } else { + b = log->tail.block + log->tail.bytes; + } + + if (trx_id != 0) { + *b++ = ROW_OP_INSERT; + trx_write_trx_id(b, trx_id); + b += DATA_TRX_ID_LEN; + } else { + *b++ = ROW_OP_DELETE; + } + + if (extra_size < 0x80) { + *b++ = (byte) extra_size; + } else { + ut_ad(extra_size < 0x8000); + *b++ = (byte) (0x80 | (extra_size >> 8)); + *b++ = (byte) extra_size; + } + + rec_convert_dtuple_to_temp<false>( + b + extra_size, index, tuple->fields, tuple->n_fields); + + b += size; + + if (mrec_size >= avail_size) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + byte* buf = log->tail.block; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + if (index->online_status != ONLINE_INDEX_COMPLETE) + goto write_failed; + /* About to run out of log, InnoDB has to + apply the online log for the completed index */ + index->lock.s_unlock(); + dberr_t error= row_log_apply( + log->alter_trx, index, nullptr, nullptr); + index->lock.s_lock(SRW_LOCK_CALL); + if (error != DB_SUCCESS) { + /* Mark all newly added indexes + as corrupted */ + log->error = error; + success = false; + goto err_exit; + } + + /* Recheck whether the index online log */ + if (!index->online_log) { + goto err_exit; + } + + goto start_log; + } + + if (mrec_size == avail_size) { + ut_ad(b == &buf[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + mrec_size); + memcpy(buf + log->tail.bytes, + log->tail.buf, avail_size); + } + + MEM_CHECK_DEFINED(buf, srv_sort_buf_size); + + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + /* If encryption is enabled encrypt buffer before writing it + to file system. */ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt( + buf, srv_sort_buf_size, + log->crypt_tail, byte_offset)) { + log->error = DB_DECRYPTION_FAILED; + goto write_failed; + } + + srv_stats.n_rowlog_blocks_encrypted.inc(); + buf = log->crypt_tail; + } + + log->tail.blocks++; + if (os_file_write( + IORequestWrite, + "(modification log)", + log->fd, + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { +write_failed: + index->type |= DICT_CORRUPT; + } + + MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); + MEM_UNDEFINED(buf, srv_sort_buf_size); + + memcpy(log->tail.block, log->tail.buf + avail_size, + mrec_size - avail_size); + log->tail.bytes = mrec_size - avail_size; + } else { + log->tail.bytes += mrec_size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); +err_exit: + mysql_mutex_unlock(&log->mutex); + return success; +} + +/******************************************************//** +Gets the error status of the online index rebuild log. 
+@return DB_SUCCESS or error code */ +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + return(index->online_log->error); +} + +/******************************************************//** +Starts logging an operation to a table that is being rebuilt. +@return pointer to log, or NULL if no logging is necessary */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +byte* +row_log_table_open( +/*===============*/ + row_log_t* log, /*!< in/out: online rebuild log */ + ulint size, /*!< in: size of log record */ + ulint* avail) /*!< out: available size for log record */ +{ + mysql_mutex_lock(&log->mutex); + + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); + + if (log->error != DB_SUCCESS) { +err_exit: + mysql_mutex_unlock(&log->mutex); + return(NULL); + } + + if (!row_log_block_allocate(log->tail)) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + ut_ad(log->tail.bytes < srv_sort_buf_size); + *avail = srv_sort_buf_size - log->tail.bytes; + + if (size > *avail) { + /* Make sure log->tail.buf is large enough */ + ut_ad(size <= sizeof log->tail.buf); + return(log->tail.buf); + } else { + return(log->tail.block + log->tail.bytes); + } +} + +/******************************************************//** +Stops logging an operation to a table that is being rebuilt. */ +static MY_ATTRIBUTE((nonnull)) +void +row_log_table_close_func( +/*=====================*/ + dict_index_t* index, /*!< in/out: online rebuilt index */ +#ifdef UNIV_DEBUG + const byte* b, /*!< in: end of log record */ +#endif /* UNIV_DEBUG */ + ulint size, /*!< in: size of log record */ + ulint avail) /*!< in: available size for log record */ +{ + row_log_t* log = index->online_log; + + mysql_mutex_assert_owner(&log->mutex); + + if (size >= avail) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + byte* buf = log->tail.block; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (size == avail) { + ut_ad(b == &buf[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + size); + memcpy(buf + log->tail.bytes, log->tail.buf, avail); + } + + MEM_CHECK_DEFINED(buf, srv_sort_buf_size); + + if (row_log_tmpfile(log) == OS_FILE_CLOSED) { + log->error = DB_OUT_OF_MEMORY; + goto err_exit; + } + + /* If encryption is enabled encrypt buffer before writing it + to file system. 
*/ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt( + log->tail.block, srv_sort_buf_size, + log->crypt_tail, byte_offset, + index->table->space_id)) { + log->error = DB_DECRYPTION_FAILED; + goto err_exit; + } + + srv_stats.n_rowlog_blocks_encrypted.inc(); + buf = log->crypt_tail; + } + + log->tail.blocks++; + if (os_file_write( + IORequestWrite, + "(modification log)", + log->fd, + buf, byte_offset, srv_sort_buf_size) + != DB_SUCCESS) { +write_failed: + log->error = DB_ONLINE_LOG_TOO_BIG; + } + + MEM_UNDEFINED(log->tail.block, srv_sort_buf_size); + MEM_UNDEFINED(buf, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail, size - avail); + log->tail.bytes = size - avail; + } else { + log->tail.bytes += size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + log->tail.total += size; + MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf); +err_exit: + mysql_mutex_unlock(&log->mutex); + + onlineddl_rowlog_rows++; + /* 10000 means 100.00%, 4525 means 45.25% */ + onlineddl_rowlog_pct_used = static_cast<ulint>((log->tail.total * 10000) / srv_online_max_size); +} + +#ifdef UNIV_DEBUG +# define row_log_table_close(index, b, size, avail) \ + row_log_table_close_func(index, b, size, avail) +#else /* UNIV_DEBUG */ +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(index, size, avail) +#endif /* UNIV_DEBUG */ + +/** Check whether a virtual column is indexed in the new table being +created during alter table +@param[in] index cluster index +@param[in] v_no virtual column number +@return true if it is indexed, else false */ +bool +row_log_col_is_indexed( + const dict_index_t* index, + ulint v_no) +{ + return(dict_table_get_nth_v_col( + index->online_log->table, v_no)->m_col.ord_part); +} + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). */ +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should + be logged, or NULL to use those in rec */ +{ + ulint old_pk_extra_size; + ulint old_pk_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + const dtuple_t* old_pk; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); + ut_ad(index->lock.have_any()); + + if (index->online_status != ONLINE_INDEX_CREATION + || (index->type & DICT_CORRUPT) || index->table->corrupted + || index->online_log->error != DB_SUCCESS) { + return; + } + + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index(new_table); + + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(index->online_log->min_trx); + + /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */ + if (index->online_log->same_pk) { + dtuple_t* tuple; + ut_ad(new_index->n_uniq == index->n_uniq); + + /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first + fields of the record. 
*/ + heap = mem_heap_create( + DATA_TRX_ID_LEN + + DTUPLE_EST_ALLOC(new_index->first_user_field())); + old_pk = tuple = dtuple_create(heap, + new_index->first_user_field()); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* field = rec_get_nth_field( + rec, offsets, i, &len); + dfield_t* dfield = dtuple_get_nth_field( + tuple, i); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(!rec_offs_nth_extern(offsets, i)); + dfield_set_data(dfield, field, len); + } + + dfield_t* db_trx_id = dtuple_get_nth_field( + tuple, new_index->n_uniq); + + const bool replace_sys_fields + = sys + || trx_read_trx_id(static_cast<byte*>(db_trx_id->data)) + < index->online_log->min_trx; + + if (replace_sys_fields) { + if (!sys || trx_read_trx_id(sys) + < index->online_log->min_trx) { + sys = reset_trx_id; + } + + dfield_set_data(db_trx_id, sys, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, sys + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); + } + + ut_d(trx_id_check(db_trx_id->data, + index->online_log->min_trx)); + } else { + /* The PRIMARY KEY has changed. Translate the tuple. */ + old_pk = row_log_table_get_pk( + rec, index, offsets, NULL, &heap); + + if (!old_pk) { + ut_ad(index->online_log->error != DB_SUCCESS); + if (heap) { + goto func_exit; + } + return; + } + } + + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + + /* 2 = 1 (extra_size) + at least 1 byte payload */ + mrec_size = 2 + old_pk_size; + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = ROW_T_DELETE; + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + + b += old_pk_size; + + row_log_table_close(index, b, mrec_size, avail_size); + } + +func_exit: + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static +void +row_log_table_low_redundant( +/*========================*/ + const rec_t* rec, /*!< in: clustered index leaf + page record in ROW_FORMAT=REDUNDANT, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + bool insert, /*!< in: true if insert, + false if update */ + const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value + (if !insert and a PRIMARY KEY + is being created) */ + const dict_index_t* new_index) + /*!< in: clustered index of the + new table, not latched */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + dtuple_t* tuple; + const ulint n_fields = rec_get_n_fields_old(rec); + + ut_ad(index->n_fields >= n_fields); + ut_ad(index->n_fields == n_fields || index->is_instant()); + ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2)); + ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */ + ut_ad(dict_index_is_clust(new_index)); + + heap = mem_heap_create(DTUPLE_EST_ALLOC(n_fields)); + tuple = dtuple_create(heap, n_fields); + dict_index_copy_types(tuple, index, n_fields); + + dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); + + if (rec_get_1byte_offs_flag(rec)) { + for (ulint i = 0; i < n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + } else { + for (ulint i = 0; i < n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_2_is_field_extern(rec, i)) { + dfield_set_ext(dfield); + } + } + } + + dfield_t* db_trx_id = dtuple_get_nth_field(tuple, index->n_uniq); + ut_ad(dfield_get_len(db_trx_id) == DATA_TRX_ID_LEN); + ut_ad(dfield_get_len(db_trx_id + 1) == DATA_ROLL_PTR_LEN); + + if (trx_read_trx_id(static_cast<const byte*> + (dfield_get_data(db_trx_id))) + < index->online_log->min_trx) { + dfield_set_data(db_trx_id, reset_trx_id, DATA_TRX_ID_LEN); + dfield_set_data(db_trx_id + 1, reset_trx_id + DATA_TRX_ID_LEN, + DATA_ROLL_PTR_LEN); + } + + const bool is_instant = index->online_log->is_instant(index); + rec_comp_status_t status = is_instant + ? 
REC_STATUS_INSTANT : REC_STATUS_ORDINARY; + + size = rec_get_converted_size_temp<true>( + index, tuple->fields, tuple->n_fields, &extra_size, status); + if (is_instant) { + size++; + extra_size++; + } + + mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + if (status == REC_STATUS_INSTANT) { + ut_ad(is_instant); + if (n_fields <= index->online_log->n_core_fields) { + status = REC_STATUS_ORDINARY; + } + *b = status; + } + + rec_convert_dtuple_to_temp<true>( + b + extra_size, index, tuple->fields, tuple->n_fields, + status); + b += size; + + row_log_table_close(index, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. 
*/ +static +void +row_log_table_low( +/*==============*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, false if update */ + const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert + and a PRIMARY KEY is being created) */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + const dict_index_t* new_index; + row_log_t* log = index->online_log; + + new_index = dict_table_get_first_index(log->table); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf); + ut_ad(index->lock.have_any()); + + /* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix + of the clustered index record (PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR), + with no information on virtual columns */ + ut_ad(!old_pk || !insert); + ut_ad(!old_pk || old_pk->n_v_fields == 0); + + if (index->online_status != ONLINE_INDEX_CREATION + || (index->type & DICT_CORRUPT) || index->table->corrupted + || log->error != DB_SUCCESS) { + return; + } + + if (!rec_offs_comp(offsets)) { + row_log_table_low_redundant( + rec, index, insert, old_pk, new_index); + return; + } + + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY + || rec_get_status(rec) == REC_STATUS_INSTANT); + + const ulint omit_size = REC_N_NEW_EXTRA_BYTES; + + const ulint rec_extra_size = rec_offs_extra_size(offsets) - omit_size; + const bool is_instant = log->is_instant(index); + extra_size = rec_extra_size + is_instant; + + unsigned fake_extra_size = 0; + byte fake_extra_buf[3]; + if (is_instant && UNIV_UNLIKELY(!index->is_instant())) { + /* The source table was emptied after ALTER TABLE + started, and it was converted to non-instant format. + Because row_log_table_apply_op() expects to find + all records to be logged in the same way, we will + be unable to copy the rec_extra_size bytes from the + record header, but must convert them here. 
*/ + unsigned n_add = index->n_fields - 1 - log->n_core_fields; + fake_extra_size = rec_get_n_add_field_len(n_add); + ut_ad(fake_extra_size == 1 || fake_extra_size == 2); + extra_size += fake_extra_size; + byte* fake_extra = fake_extra_buf + fake_extra_size; + rec_set_n_add_field(fake_extra, n_add); + ut_ad(fake_extra == fake_extra_buf); + } + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size + + is_instant + fake_extra_size; + + if (insert || log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp<false>( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(log, mrec_size, &avail_size)) { + if (insert) { + *b++ = ROW_T_INSERT; + } else { + *b++ = ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp<false>( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + if (is_instant) { + *b++ = fake_extra_size + ? REC_STATUS_INSTANT + : rec_get_status(rec); + } else { + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + } + + memcpy(b, rec - rec_extra_size - omit_size, rec_extra_size); + b += rec_extra_size; + memcpy(b, fake_extra_buf + 1, fake_extra_size); + b += fake_extra_size; + ulint len; + ulint trx_id_offs = rec_get_nth_field_offs( + offsets, index->n_uniq, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memcpy(b, rec, rec_offs_data_size(offsets)); + if (trx_read_trx_id(b + trx_id_offs) < log->min_trx) { + memcpy(b + trx_id_offs, + reset_trx_id, sizeof reset_trx_id); + } + b += rec_offs_data_size(offsets); + + row_log_table_close(index, b, mrec_size, avail_size); + } +} + +/******************************************************//** +Logs an update to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ +{ + row_log_table_low(rec, index, offsets, false, old_pk); +} + +/** Gets the old table column of a PRIMARY KEY column. 
+@param table old table (before ALTER TABLE) +@param col_map mapping of old column numbers to new ones +@param col_no column position in the new table +@return old table column, or NULL if this is an added column */ +static +const dict_col_t* +row_log_table_get_pk_old_col( +/*=========================*/ + const dict_table_t* table, + const ulint* col_map, + ulint col_no) +{ + for (ulint i = 0; i < table->n_cols; i++) { + if (col_no == col_map[i]) { + return(dict_table_get_nth_col(table, i)); + } + } + + return(NULL); +} + +/** Maps an old table column of a PRIMARY KEY column. +@param[in] ifield clustered index field in the new table (after +ALTER TABLE) +@param[in] index the clustered index of ifield +@param[in,out] dfield clustered index tuple field in the new table +@param[in,out] heap memory heap for allocating dfield contents +@param[in] rec clustered index leaf page record in the old +table +@param[in] offsets rec_get_offsets(rec) +@param[in] i rec field corresponding to col +@param[in] zip_size ROW_FORMAT=COMPRESSED size of the old table +@param[in] max_len maximum length of dfield +@param[in] log row log for the table +@retval DB_INVALID_NULL if a NULL value is encountered +@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */ +static +dberr_t +row_log_table_get_pk_col( + const dict_field_t* ifield, + const dict_index_t* index, + dfield_t* dfield, + mem_heap_t* heap, + const rec_t* rec, + const rec_offs* offsets, + ulint i, + ulint zip_size, + ulint max_len, + const row_log_t* log) +{ + const byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, i, &len); + + if (len == UNIV_SQL_DEFAULT) { + field = log->instant_field_value(i, &len); + } + + if (len == UNIV_SQL_NULL) { + if (!log->allow_not_null) { + return(DB_INVALID_NULL); + } + + unsigned col_no= ifield->col->ind; + ut_ad(col_no < log->defaults->n_fields); + + field = static_cast<const byte*>( + log->defaults->fields[col_no].data); + if (!field) { + return(DB_INVALID_NULL); + } + len = log->defaults->fields[col_no].len; + } + + if (rec_offs_nth_extern(offsets, i)) { + ulint field_len = ifield->prefix_len; + byte* blob_field; + + if (!field_len) { + field_len = ifield->fixed_len; + if (!field_len) { + field_len = max_len + 1; + } + } + + blob_field = static_cast<byte*>( + mem_heap_alloc(heap, field_len)); + + len = btr_copy_externally_stored_field_prefix( + blob_field, field_len, zip_size, field, len); + if (len >= max_len + 1) { + return(DB_TOO_BIG_INDEX_COL); + } + + dfield_set_data(dfield, blob_field, len); + } else { + dfield_set_data(dfield, mem_heap_dup(heap, field, len), len); + } + + return(DB_SUCCESS); +} + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. 
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) */ + byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for + row_log_table_delete(), or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ +{ + dtuple_t* tuple = NULL; + row_log_t* log = index->online_log; + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!offsets || rec_offs_validate(rec, index, offsets)); + ut_ad(index->lock.have_any()); + ut_ad(log); + ut_ad(log->table); + ut_ad(log->min_trx); + + if (log->same_pk) { + /* The PRIMARY KEY columns are unchanged. */ + if (sys) { + /* Store the DB_TRX_ID,DB_ROLL_PTR. */ + ulint trx_id_offs = index->trx_id_offset; + + if (!trx_id_offs) { + ulint len; + + if (!offsets) { + offsets = rec_get_offsets( + rec, index, nullptr, + index->n_core_fields, + index->db_trx_id() + 1, heap); + } + + trx_id_offs = rec_get_nth_field_offs( + offsets, index->db_trx_id(), &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + const byte* ptr = trx_read_trx_id(rec + trx_id_offs) + < log->min_trx + ? reset_trx_id + : rec + trx_id_offs; + + memcpy(sys, ptr, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + ut_d(trx_id_check(sys, log->min_trx)); + } + + return(NULL); + } + + mysql_mutex_lock(&log->mutex); + + /* log->error is protected by log->mutex. */ + if (log->error == DB_SUCCESS) { + dict_table_t* new_table = log->table; + dict_index_t* new_index + = dict_table_get_first_index(new_table); + const ulint new_n_uniq + = dict_index_get_n_unique(new_index); + + if (!*heap) { + ulint size = 0; + + if (!offsets) { + size += (1 + REC_OFFS_HEADER_SIZE + + unsigned(index->n_fields)) + * sizeof *offsets; + } + + for (ulint i = 0; i < new_n_uniq; i++) { + size += dict_col_get_min_size( + dict_index_get_nth_col(new_index, i)); + } + + *heap = mem_heap_create( + DTUPLE_EST_ALLOC(new_n_uniq + 2) + size); + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, heap); + } + + tuple = dtuple_create(*heap, new_n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_n_uniq); + + const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table); + + const ulint zip_size = index->table->space->zip_size(); + + for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { + dict_field_t* ifield; + dfield_t* dfield; + ulint prtype; + ulint mbminlen, mbmaxlen; + + ifield = dict_index_get_nth_field(new_index, new_i); + dfield = dtuple_get_nth_field(tuple, new_i); + + const ulint col_no + = dict_field_get_col(ifield)->ind; + + if (const dict_col_t* col + = row_log_table_get_pk_old_col( + index->table, log->col_map, col_no)) { + ulint i = dict_col_get_clust_pos(col, index); + + if (i == ULINT_UNDEFINED) { + ut_ad(0); + log->error = DB_CORRUPTION; + goto err_exit; + } + + log->error = row_log_table_get_pk_col( + ifield, new_index, dfield, *heap, + rec, offsets, i, zip_size, max_len, + log); + + if (log->error != DB_SUCCESS) { +err_exit: + tuple = NULL; + goto func_exit; + } + + mbminlen = col->mbminlen; + mbmaxlen = col->mbmaxlen; + prtype = col->prtype; + } else { + /* No matching column was found in the old + table, so this must be an 
added column. + Copy the default value. */ + ut_ad(log->defaults); + + dfield_copy(dfield, dtuple_get_nth_field( + log->defaults, col_no)); + mbminlen = dfield->type.mbminlen; + mbmaxlen = dfield->type.mbmaxlen; + prtype = dfield->type.prtype; + } + + ut_ad(!dfield_is_ext(dfield)); + ut_ad(!dfield_is_null(dfield)); + + if (ifield->prefix_len) { + ulint len = dtype_get_at_most_n_mbchars( + prtype, mbminlen, mbmaxlen, + ifield->prefix_len, + dfield_get_len(dfield), + static_cast<const char*>( + dfield_get_data(dfield))); + + ut_ad(len <= dfield_get_len(dfield)); + dfield_set_len(dfield, len); + } + } + + const byte* trx_roll = rec + + row_get_trx_id_offset(index, offsets); + + /* Copy the fields, because the fields will be updated + or the record may be moved somewhere else in the B-tree + as part of the upcoming operation. */ + if (trx_read_trx_id(trx_roll) < log->min_trx) { + trx_roll = reset_trx_id; + if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + } else if (sys) { + memcpy(sys, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + trx_roll = sys; + } else { + trx_roll = static_cast<const byte*>( + mem_heap_dup( + *heap, trx_roll, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); + } + + ut_d(trx_id_check(trx_roll, log->min_trx)); + + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), + trx_roll, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), + trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + +func_exit: + mysql_mutex_unlock(&log->mutex); + return(tuple); +} + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). */ +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec,index) */ +{ + row_log_table_low(rec, index, offsets, true, NULL); +} + +/******************************************************//** +Converts a log record to a table row. +@return converted row, or NULL if the conversion fails */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const dtuple_t* +row_log_table_apply_convert_mrec( +/*=============================*/ + const mrec_t* mrec, /*!< in: merge record */ + dict_index_t* index, /*!< in: index of mrec */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + row_log_t* log, /*!< in: rebuild context */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dberr_t* error) /*!< out: DB_SUCCESS or + DB_MISSING_HISTORY or + reason of failure */ +{ + dtuple_t* row; + + log->n_rows++; + *error = DB_SUCCESS; + + /* This is based on row_build(). */ + if (log->defaults) { + row = dtuple_copy(log->defaults, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(log->table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(log->table)); + dict_table_copy_types(row, log->table); + } + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. 
For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + + if (col->is_dropped()) { + /* the column was instantly dropped earlier */ + ut_ad(index->table->instant); + continue; + } + + ulint col_no + = log->col_map[dict_col_get_no(col)]; + + if (col_no == ULINT_UNDEFINED) { + /* the column is being dropped now */ + continue; + } + + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + ulint len; + const byte* data; + + if (rec_offs_nth_extern(offsets, i)) { + ut_ad(rec_offs_any_extern(offsets)); + index->lock.x_lock(SRW_LOCK_CALL); + + data = btr_rec_copy_externally_stored_field( + mrec, offsets, + index->table->space->zip_size(), + i, &len, heap); + ut_a(data); + dfield_set_data(dfield, data, len); + + index->lock.x_unlock(); + } else { + data = rec_get_nth_field(mrec, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + data = log->instant_field_value(i, &len); + } + dfield_set_data(dfield, data, len); + } + + if (len != UNIV_SQL_NULL && col->mtype == DATA_MYSQL + && col->len != len && !dict_table_is_comp(log->table)) { + + ut_ad(col->len >= len); + if (dict_table_is_comp(index->table)) { + byte* buf = (byte*) mem_heap_alloc(heap, + col->len); + memcpy(buf, dfield->data, len); + memset(buf + len, 0x20, col->len - len); + + dfield_set_data(dfield, buf, col->len); + } else { + /* field length mismatch should not happen + when rebuilding the redundant row format + table. */ + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + } + + /* See if any columns were changed to NULL or NOT NULL. */ + const dict_col_t* new_col + = dict_table_get_nth_col(log->table, col_no); + ut_ad(new_col->same_format(*col)); + + /* Assert that prtype matches except for nullability. */ + ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) + & ~(DATA_NOT_NULL | DATA_VERSIONED + | CHAR_COLL_MASK << 16 | DATA_LONG_TRUE_VARCHAR))); + + if (new_col->prtype == col->prtype) { + continue; + } + + if ((new_col->prtype & DATA_NOT_NULL) + && dfield_is_null(dfield)) { + + if (!log->allow_not_null) { + /* We got a NULL value for a NOT NULL column. */ + *error = DB_INVALID_NULL; + return NULL; + } + + const dfield_t& default_field + = log->defaults->fields[col_no]; + + Field* field = log->old_table->field[col->ind]; + + field->set_warning(Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(log->n_rows)); + + *dfield = default_field; + } + + /* Adjust the DATA_NOT_NULL flag in the parsed row. */ + dfield_get_type(dfield)->prtype = new_col->prtype; + + ut_ad(dict_col_type_assert_equal(new_col, + dfield_get_type(dfield))); + } + + return(row); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert_low( +/*===========================*/ + que_thr_t* thr, /*!< in: query graph */ + const dtuple_t* row, /*!< in: table row + in the old table definition */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + dberr_t error; + dtuple_t* entry; + const row_log_t*log = dup->index->online_log; + dict_index_t* index = dict_table_get_first_index(log->table); + ulint n_index = 0; + + ut_ad(dtuple_validate(row)); + + DBUG_LOG("ib_alter_table", + "insert table " << index->table->id << " (index " + << index->id << "): " << rec_printer(row).str()); + + static const ulint flags + = (BTR_CREATE_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG); + + entry = row_build_index_entry(row, NULL, index, heap); + + error = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, index->n_uniq, + entry, 0, thr); + + switch (error) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + /* The row had already been copied to the table. */ + return(DB_SUCCESS); + default: + return(error); + } + + ut_ad(dict_index_is_clust(index)); + + for (n_index += index->type != DICT_CLUSTERED; + (index = dict_table_get_next_index(index)); n_index++) { + if (index->type & DICT_FTS) { + continue; + } + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + flags, BTR_INSERT_TREE, + index, offsets_heap, heap, entry, + thr_get_trx(thr)->id, thr); + + if (error != DB_SUCCESS) { + if (error == DB_DUPLICATE_KEY) { + thr_get_trx(thr)->error_key_num = n_index; + } + break; + } + } + + return(error); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + const mrec_t* mrec, /*!< in: record to insert */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + row_log_t*log = dup->index->online_log; + dberr_t error; + const dtuple_t* row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, &error); + + switch (error) { + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + /* fall through */ + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + error = row_log_table_apply_insert_low( + thr, row, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + return(error); +} + +/******************************************************//** +Deletes a record from a table that is being rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_delete_low( +/*===========================*/ + btr_pcur_t* pcur, /*!< in/out: B-tree cursor, + will be trashed */ + const rec_offs* offsets, /*!< in: offsets on pcur */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + dberr_t error; + row_ext_t* ext; + dtuple_t* row; + dict_index_t* index = pcur->index(); + + ut_ad(dict_index_is_clust(index)); + + DBUG_LOG("ib_alter_table", + "delete table " << index->table->id << " (index " + << index->id << "): " + << rec_printer(btr_pcur_get_rec(pcur), offsets).str()); + + if (dict_table_get_next_index(index)) { + /* Build a row template for purging secondary index entries. */ + row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(pcur), + offsets, NULL, NULL, NULL, &ext, heap); + } else { + row = NULL; + } + + btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + if (error != DB_SUCCESS) { +err_exit: + mtr->commit(); + return error; + } + + mtr->commit(); + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->type & DICT_FTS) { + continue; + } + + const dtuple_t* entry = row_build_index_entry( + row, ext, index, heap); + mtr->start(); + index->set_modified(*mtr); + pcur->btr_cur.page_cur.index = index; + error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_PURGE_TREE, pcur, + mtr); + if (error) { + goto err_exit; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(pcur)) + || btr_pcur_get_low_match(pcur) < index->n_uniq) { + /* All secondary index entries should be + found, because new_table is being modified by + this thread only, and all indexes should be + updated in sync. */ + mtr->commit(); + return(DB_INDEX_CORRUPT); + } + + btr_cur_pessimistic_delete(&error, FALSE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + mtr->commit(); + } + + return(error); +} + +/******************************************************//** +Replays a delete operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_delete( +/*=======================*/ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: merge record */ + const rec_offs* moffsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const row_log_t* log) /*!< in: online log */ +{ + dict_table_t* new_table = log->table; + dict_index_t* index = dict_table_get_first_index(new_table); + dtuple_t* old_pk; + mtr_t mtr; + btr_pcur_t pcur; + rec_offs* offsets; + + pcur.btr_cur.page_cur.index = index; + ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field()); + ut_ad(!rec_offs_any_extern(moffsets)); + + /* Convert the row to a search tuple. 
*/ + old_pk = dtuple_create(heap, index->n_uniq); + dict_index_copy_types(old_pk, index, index->n_uniq); + + for (ulint i = 0; i < index->n_uniq; i++) { + ulint len; + const void* field; + field = rec_get_nth_field(mrec, moffsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + dfield_set_data(dtuple_get_nth_field(old_pk, i), + field, len); + } + + mtr_start(&mtr); + index->set_modified(mtr); + dberr_t err = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_PURGE_TREE, &pcur, + &mtr); + if (err != DB_SUCCESS) { + goto all_done; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { +all_done: + mtr_commit(&mtr); + /* The record was not found. All done. */ + /* This should only happen when an earlier + ROW_T_INSERT was skipped or + ROW_T_UPDATE was interpreted as ROW_T_DELETE + due to BLOBs having been freed by rollback. */ + return err; + } + + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */ + + { + ulint len; + const byte* mrec_trx_id + = rec_get_nth_field(mrec, moffsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const byte* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_d(trx_id_check(mrec_trx_id, log->min_trx)); + + ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len) + == mrec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col + 1, &len) + == rec_trx_id + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + + if (memcmp(mrec_trx_id, rec_trx_id, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The ROW_T_DELETE was logged for a different + PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR. + This is possible if a ROW_T_INSERT was skipped + or a ROW_T_UPDATE was interpreted as ROW_T_DELETE + because some BLOBs were missing due to + (1) rolling back the initial insert, or + (2) purging the BLOB for a later ROW_T_DELETE + (3) purging 'old values' for a later ROW_T_UPDATE + or ROW_T_DELETE. */ + ut_ad(!log->same_pk); + goto all_done; + } + } + + return row_log_table_apply_delete_low(&pcur, offsets, heap, &mtr); +} + +/******************************************************//** +Replays an update operation on a table that was rebuilt. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_update( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: new value */ + const rec_offs* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + const dtuple_t* old_pk) /*!< in: PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR + of the old value, + or PRIMARY KEY if same_pk */ +{ + row_log_t* log = dup->index->online_log; + const dtuple_t* row; + dict_index_t* index = dict_table_get_first_index(log->table); + mtr_t mtr; + btr_pcur_t pcur; + dberr_t error; + ulint n_index = 0; + + pcur.btr_cur.page_cur.index = index; + + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(index)); + ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2) + == dict_index_get_n_unique(index)); + + row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, &error); + + switch (error) { + case DB_SUCCESS: + ut_ad(row != NULL); + break; + default: + ut_ad(0); + /* fall through */ + case DB_INVALID_NULL: + ut_ad(row == NULL); + return(error); + } + + mtr.start(); + index->set_modified(mtr); + error = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE, &pcur, + &mtr); + if (error != DB_SUCCESS) { +func_exit: + mtr.commit(); +func_exit_committed: + ut_ad(mtr.has_committed()); + ut_free(pcur.old_rec_buf); + + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + + return error; + } +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + ut_ad(0);/* We did not request buffering. */ + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } +#endif /* UNIV_DEBUG */ + + ut_ad(!page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + && btr_pcur_get_low_match(&pcur) >= index->n_uniq); + + /* Prepare to update (or delete) the record. 
*/ + rec_offs* cur_offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), index, nullptr, index->n_core_fields, + ULINT_UNDEFINED, &offsets_heap); + +#ifdef UNIV_DEBUG + if (!log->same_pk) { + ulint len; + const byte* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), + cur_offsets, index->n_uniq, &len); + const dfield_t* old_pk_trx_id + = dtuple_get_nth_field(old_pk, index->n_uniq); + ut_ad(len == DATA_TRX_ID_LEN); + ut_d(trx_id_check(rec_trx_id, log->min_trx)); + ut_ad(old_pk_trx_id->len == DATA_TRX_ID_LEN); + ut_ad(old_pk_trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(DATA_TRX_ID_LEN + + static_cast<const char*>(old_pk_trx_id->data) + == old_pk_trx_id[1].data); + ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx)); + ut_ad(!memcmp(rec_trx_id, old_pk_trx_id->data, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); + } +#endif + + dtuple_t* entry = row_build_index_entry_low( + row, NULL, index, heap, ROW_BUILD_NORMAL); + upd_t* update = row_upd_build_difference_binary( + index, entry, btr_pcur_get_rec(&pcur), cur_offsets, + false, false, NULL, heap, dup->table, &error); + if (error != DB_SUCCESS || !update->n_fields) { + goto func_exit; + } + + const bool pk_updated + = upd_get_nth_field(update, 0)->field_no < new_trx_id_col; + + if (pk_updated || rec_offs_any_extern(cur_offsets)) { + /* If the record contains any externally stored + columns, perform the update by delete and insert, + because we will not write any undo log that would + allow purge to free any orphaned externally stored + columns. */ + + if (pk_updated && log->same_pk) { + /* The ROW_T_UPDATE log record should only be + written when the PRIMARY KEY fields of the + record did not change in the old table. We + can only get a change of PRIMARY KEY columns + in the rebuilt table if the PRIMARY KEY was + redefined (!same_pk). */ + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit; + } + + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, heap, &mtr); + ut_ad(mtr.has_committed()); + + if (error == DB_SUCCESS) { + error = row_log_table_apply_insert_low( + thr, row, offsets_heap, heap, dup); + } + + goto func_exit_committed; + } + + dtuple_t* old_row; + row_ext_t* old_ext; + + if (dict_table_get_next_index(index)) { + /* Construct the row corresponding to the old value of + the record. 
*/ + old_row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur), + cur_offsets, NULL, NULL, NULL, &old_ext, heap); + ut_ad(old_row); + + DBUG_LOG("ib_alter_table", + "update table " << index->table->id + << " (index " << index->id + << ": " << rec_printer(old_row).str() + << " to " << rec_printer(row).str()); + } else { + old_row = NULL; + old_ext = NULL; + } + + big_rec_t* big_rec; + + error = btr_cur_pessimistic_update( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG + | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &cur_offsets, &offsets_heap, heap, &big_rec, + update, 0, thr, 0, &mtr); + + if (big_rec) { + if (error == DB_SUCCESS) { + error = btr_store_big_rec_extern_fields( + &pcur, cur_offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + + for (n_index += index->type != DICT_CLUSTERED; + (index = dict_table_get_next_index(index)); n_index++) { + if (!index->is_btree()) { + continue; + } + + if (error != DB_SUCCESS) { + break; + } + + if (!row_upd_changes_ord_field_binary( + index, update, thr, old_row, NULL)) { + continue; + } + + if (dict_index_has_virtual(index)) { + dtuple_copy_v_fields(old_row, old_pk); + } + + mtr.commit(); + + entry = row_build_index_entry(old_row, old_ext, index, heap); + if (!entry) { + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit_committed; + } + + mtr.start(); + index->set_modified(mtr); + pcur.btr_cur.page_cur.index = index; + + ut_free(pcur.old_rec_buf); + pcur.old_rec_buf = nullptr; + + if (ROW_FOUND != row_search_index_entry( + entry, BTR_MODIFY_TREE, &pcur, &mtr)) { + ut_ad(0); + error = DB_CORRUPTION; + break; + } + + btr_cur_pessimistic_delete( + &error, FALSE, btr_pcur_get_btr_cur(&pcur), + BTR_CREATE_FLAG, false, &mtr); + + if (error != DB_SUCCESS) { + break; + } + + mtr.commit(); + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, + BTR_INSERT_TREE, index, offsets_heap, heap, + entry, thr_get_trx(thr)->id, thr); + + /* Report correct index name for duplicate key error. */ + if (error == DB_DUPLICATE_KEY) { + thr_get_trx(thr)->error_key_num = n_index; + } + + mtr.start(); + index->set_modified(mtr); + } + + goto func_exit; +} + +/******************************************************//** +Applies an operation to a table that was rebuilt. 
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const mrec_t* +row_log_table_apply_op( +/*===================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in new index */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS + or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + rec_offs* offsets) /*!< in/out: work area + for parsing mrec */ +{ + row_log_t* log = dup->index->online_log; + dict_index_t* new_index = dict_table_get_first_index(log->table); + ulint extra_size; + const mrec_t* next_mrec; + dtuple_t* old_pk; + + ut_ad(dict_index_is_clust(dup->index)); + ut_ad(dup->index->table != log->table); + ut_ad(log->head.total <= log->tail.total); + + *error = DB_SUCCESS; + + /* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */ + if (mrec + 3 >= mrec_end) { + return(NULL); + } + + const bool is_instant = log->is_instant(dup->index); + const mrec_t* const mrec_start = mrec; + + switch (*mrec++) { + default: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + case ROW_T_INSERT: + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } else { + log->head.total += ulint(next_mrec - mrec_start); + *error = row_log_table_apply_insert( + thr, mrec, offsets, offsets_heap, + heap, dup); + } + break; + + case ROW_T_DELETE: + extra_size = *mrec++; + ut_ad(mrec < mrec_end); + + /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. + For fixed-length PRIMARY key columns, it is 0. */ + mrec += extra_size; + + /* The ROW_T_DELETE record was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, new_index->first_user_field()); + rec_init_offsets_temp(mrec, new_index, offsets); + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec > mrec_end) { + return(NULL); + } + + log->head.total += ulint(next_mrec - mrec_start); + + *error = row_log_table_apply_delete( + new_trx_id_col, + mrec, offsets, offsets_heap, heap, log); + break; + + case ROW_T_UPDATE: + /* Logically, the log entry consists of the + (PRIMARY KEY,DB_TRX_ID) of the old value (converted + to the new primary key definition) followed by + the new value in the old table definition. If the + definition of the columns belonging to PRIMARY KEY + is not changed, the log will only contain + DB_TRX_ID,new_row. */ + + if (log->same_pk) { + ut_ad(new_index->n_uniq == dup->index->n_uniq); + + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. 
*/ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + + old_pk = dtuple_create(heap, new_index->n_uniq); + dict_index_copy_types( + old_pk, new_index, old_pk->n_fields); + + /* Copy the PRIMARY KEY fields from mrec to old_pk. */ + for (ulint i = 0; i < new_index->n_uniq; i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + } else { + /* We assume extra_size < 0x100 + for the PRIMARY KEY prefix. */ + mrec += *mrec + 1; + + if (mrec > mrec_end) { + return(NULL); + } + + /* Get offsets for PRIMARY KEY, + DB_TRX_ID, DB_ROLL_PTR. */ + /* The old_pk prefix was converted by + rec_convert_dtuple_to_temp() using new_index. */ + ut_ad(!new_index->is_instant()); + rec_offs_set_n_fields(offsets, + new_index->first_user_field()); + rec_init_offsets_temp(mrec, new_index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec + 2 > mrec_end) { + return(NULL); + } + + /* Copy the PRIMARY KEY fields and + DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ + old_pk = dtuple_create(heap, + new_index->first_user_field()); + dict_index_copy_types(old_pk, new_index, + old_pk->n_fields); + + for (ulint i = 0; i < new_index->first_user_field(); + i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + + mrec = next_mrec; + + /* Fetch the new value of the row as it was + in the old table definition. */ + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + ut_ad(extra_size || !is_instant); + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets, + log->n_core_fields, + log->non_core_fields, + is_instant + ? static_cast<rec_comp_status_t>( + *(mrec - extra_size)) + : REC_STATUS_ORDINARY); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + } + + ut_ad(next_mrec <= mrec_end); + log->head.total += ulint(next_mrec - mrec_start); + dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq); + + *error = row_log_table_apply_update( + thr, new_trx_id_col, + mrec, offsets, offsets_heap, heap, dup, old_pk); + break; + } + + ut_ad(log->head.total <= log->tail.total); + mem_heap_empty(offsets_heap); + mem_heap_empty(heap); + return(next_mrec); +} + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Estimate how much an ALTER TABLE progress should be incremented per +one block of log applied. +For the other phases of ALTER TABLE we increment the progress with 1 per +page processed. 
+@return amount of abstract units to add to work_completed when one block +of log is applied. +*/ +inline +ulint +row_log_progress_inc_per_block() +{ + /* We must increment the progress once per page (as in + srv_page_size, default = innodb_page_size=16KiB). + One block here is srv_sort_buf_size (usually 1MiB). */ + const ulint pages_per_block = std::max<ulint>( + ulint(srv_sort_buf_size >> srv_page_size_shift), 1); + + /* Multiply by an artificial factor of 6 to even the pace with + the rest of the ALTER TABLE phases, they process page_size amount + of data faster. */ + return(pages_per_block * 6); +} + +/** Estimate how much work is to be done by the log apply phase +of an ALTER TABLE for this index. +@param[in] index index whose log to assess +@return work to be done by log-apply in abstract units +*/ +ulint +row_log_estimate_work( + const dict_index_t* index) +{ + if (index == NULL || index->online_log == NULL + || index->online_log_is_dummy()) { + return(0); + } + + const row_log_t* l = index->online_log; + const ulint bytes_left = + static_cast<ulint>(l->tail.total - l->head.total); + const ulint blocks_left = bytes_left / srv_sort_buf_size; + + return(blocks_left * row_log_progress_inc_per_block()); +} +#else /* HAVE_PSI_STAGE_INTERFACE */ +inline +ulint +row_log_progress_inc_per_block() +{ + return(0); +} +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +/** Applies operations to a table was rebuilt. +@param[in] thr query graph +@param[in,out] dup for reporting duplicate key errors +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, then stage->inc() will be called for each block +of log that is applied. +@return DB_SUCCESS, or error code on failure */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_log_table_apply_ops( + que_thr_t* thr, + row_merge_dup_t* dup, + ut_stage_alter_t* stage) +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end = NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* heap; + mem_heap_t* offsets_heap; + rec_offs* offsets; + bool has_index_lock; + dict_index_t* index = const_cast<dict_index_t*>( + dup->index); + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index( + new_table); + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + std::max<ulint>(index->n_fields, + new_index->first_user_field()); + const ulint new_trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(trx->mysql_thd); + ut_ad(index->lock.have_x()); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, DATA_TRX_ID), index) + != ULINT_UNDEFINED); + ut_ad(new_trx_id_col > 0); + ut_ad(new_trx_id_col != ULINT_UNDEFINED); + + MEM_UNDEFINED(&mrec_end, sizeof mrec_end); + + offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + + heap = mem_heap_create(srv_page_size); + offsets_heap = mem_heap_create(srv_page_size); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); + ut_ad(index->lock.have_u_or_x()); + ut_ad(index->online_log->head.bytes == 0); + + stage->inc(row_log_progress_inc_per_block()); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if 
(index->is_corrupted()) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + ut_ad(dict_index_is_online_ddl(index)); + + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + ib::error() << "Unexpected end of temporary file for table " + << index->table->name; +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (index->online_log->fd > 0 + && ftruncate(index->online_log->fd, 0) == -1) { + ib::error() + << "\'" << index->name + 1 + << "\' failed with error " + << errno << ":" << strerror(errno); + + goto corruption; + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->head.bytes = 0; + index->online_log->tail.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + index->lock.x_unlock(); + + log_free_check(); + + ut_ad(dict_index_is_online_ddl(index)); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + byte* buf = index->online_log->head.block; + + if (DB_SUCCESS + != os_file_read(IORequestRead, index->online_log->fd, + buf, ofs, srv_sort_buf_size, nullptr)) { + ib::error() + << "Unable to read temporary file" + " for table " << index->table->name; + goto corruption; + } + + if (srv_encrypt_log) { + if (!log_tmp_block_decrypt( + buf, srv_sort_buf_size, + index->online_log->crypt_head, ofs)) { + error = DB_DECRYPTION_FAILED; + goto func_exit; + } + + srv_stats.n_rowlog_blocks_decrypted.inc(); + memcpy(buf, index->online_log->crypt_head, + srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + /* This read is not protected by index->online_log->mutex for + performance reasons. We will eventually notice any error that + was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. 
*/ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + ulint((&index->online_log->head.buf)[1] - mrec_end)); + mrec = row_log_table_apply_op( + thr, new_trx_id_col, + dup, &error, offsets_heap, heap, + index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = ulint(mrec - mrec_end); + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec <= mrec_end); + + if (mrec == mrec_end) { + /* We are at the end of the log. + Mark the replay all_done. */ + if (has_index_lock) { + goto all_done; + } + } + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + /* This read is not protected by index->online_log->mutex + for performance reasons. We will eventually notice any + error that was flagged by a DML thread. 
*/ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + next_mrec = row_log_table_apply_op( + thr, new_trx_id_col, + dup, &error, offsets_heap, heap, + mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + index->lock.x_lock(SRW_LOCK_CALL); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes + += ulint(next_mrec - mrec); + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + mem_heap_free(offsets_heap); + mem_heap_free(heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/** Apply the row_log_table log to a table upon completing rebuild. +@param[in] thr query graph +@param[in] old_table old table +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_table() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_table_apply( + que_thr_t* thr, + dict_table_t* old_table, + struct TABLE* table, + ut_stage_alter_t* stage, + dict_table_t* new_table) +{ + dberr_t error; + dict_index_t* clust_index; + + thr_get_trx(thr)->error_key_num = 0; + DBUG_EXECUTE_IF("innodb_trx_duplicates", + thr_get_trx(thr)->duplicates = TRX_DUP_REPLACE;); + + stage->begin_phase_log_table(); + + clust_index = dict_table_get_first_index(old_table); + + if (clust_index->online_log->n_rows == 0) { + clust_index->online_log->n_rows = new_table->stat_n_rows; + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + + if (!clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + /* This function should not be called unless + rebuilding a table online. Build in some fault + tolerance. */ + ut_ad(0); + error = DB_ERROR; + } else { + row_merge_dup_t dup = { + clust_index, table, + clust_index->online_log->col_map, 0 + }; + + error = row_log_table_apply_ops(thr, &dup, stage); + + ut_ad(error != DB_SUCCESS + || clust_index->online_log->head.total + == clust_index->online_log->tail.total); + } + + clust_index->lock.x_unlock(); + DBUG_EXECUTE_IF("innodb_trx_duplicates", + thr_get_trx(thr)->duplicates = 0;); + + return(error); +} + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
+@retval true if success, false if not */ +bool +row_log_allocate( +/*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in: table definition before alter */ + const bool allow_not_null) /*!< in: allow null to not-null + conversion */ +{ + row_log_t* log; + DBUG_ENTER("row_log_allocate"); + + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(dict_index_is_clust(index) == !!table); + ut_ad(!table || index->table != table); + ut_ad(same_pk || table); + ut_ad(!table || col_map); + ut_ad(!defaults || col_map); + ut_ad(index->lock.have_u_or_x()); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); + + log = static_cast<row_log_t*>(ut_malloc_nokey(sizeof *log)); + + if (log == NULL) { + DBUG_RETURN(false); + } + + log->fd = OS_FILE_CLOSED; + mysql_mutex_init(index_online_log_key, &log->mutex, nullptr); + + log->table = table; + log->same_pk = same_pk; + log->defaults = defaults; + log->col_map = col_map; + log->error = DB_SUCCESS; + log->min_trx = trx->id; + log->max_trx = 0; + log->tail.blocks = log->tail.bytes = 0; + log->tail.total = 0; + log->tail.block = log->head.block = NULL; + log->crypt_tail = log->crypt_head = NULL; + log->head.blocks = log->head.bytes = 0; + log->head.total = 0; + log->path = path; + log->n_core_fields = index->n_core_fields; + ut_ad(!table || log->is_instant(index) + == (index->n_core_fields < index->n_fields)); + log->allow_not_null = allow_not_null; + log->old_table = old_table; + log->n_rows = 0; + + if (table && index->is_instant()) { + const unsigned n = log->n_core_fields; + log->non_core_fields = UT_NEW_ARRAY_NOKEY( + dict_col_t::def_t, index->n_fields - n); + for (unsigned i = n; i < index->n_fields; i++) { + log->non_core_fields[i - n] + = index->fields[i].col->def_val; + } + } else { + log->non_core_fields = NULL; + } + + dict_index_set_online_status(index, ONLINE_INDEX_CREATION); + + if (srv_encrypt_log) { + log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size; + log->crypt_head = static_cast<byte *>( + my_large_malloc(&log->crypt_head_size, MYF(MY_WME))); + log->crypt_tail = static_cast<byte *>( + my_large_malloc(&log->crypt_tail_size, MYF(MY_WME))); + + if (!log->crypt_head || !log->crypt_tail) { + row_log_free(log); + DBUG_RETURN(false); + } + } + + index->online_log = log; + + if (!table) { + /* Assign the clustered index online log to table. + It can be used by concurrent DML to identify whether + the table has any online DDL */ + index->table->indexes.start->online_log_make_dummy(); + log->alter_trx = trx; + } + + /* While we might be holding an exclusive data dictionary lock + here, in row_log_abort_sec() we will not always be holding it. Use + atomic operations in both cases. */ + MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX); + + DBUG_RETURN(true); +} + +/******************************************************//** +Free the row log for an index that was being created online. 
*/ +void +row_log_free( +/*=========*/ + row_log_t* log) /*!< in,own: row log */ +{ + MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); + + UT_DELETE_ARRAY(log->non_core_fields); + row_log_block_free(log->tail); + row_log_block_free(log->head); + row_merge_file_destroy_low(log->fd); + + if (log->crypt_head) { + my_large_free(log->crypt_head, log->crypt_head_size); + } + + if (log->crypt_tail) { + my_large_free(log->crypt_tail, log->crypt_tail_size); + } + + mysql_mutex_destroy(&log->mutex); + ut_free(log); +} + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ +{ + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION); +#ifdef SAFE_MUTEX + ut_ad(index->lock.have_x() + || (index->lock.have_s() + && mysql_mutex_is_owner(&index->online_log->mutex))); +#endif + return(index->online_log->max_trx); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. */ +static MY_ATTRIBUTE((nonnull)) +void +row_log_apply_op_low( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + enum row_op op, /*!< in: operation being applied */ + trx_id_t trx_id, /*!< in: transaction identifier */ + const dtuple_t* entry) /*!< in: row */ +{ + mtr_t mtr; + btr_cur_t cursor; + rec_offs* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); + + ut_ad(index->lock.have_x() == has_index_lock); + + ut_ad(!index->is_corrupted()); + ut_ad(trx_id != 0 || op == ROW_OP_DELETE); + + DBUG_LOG("ib_create_index", + (op == ROW_OP_INSERT ? "insert " : "delete ") + << (has_index_lock ? "locked index " : "unlocked index ") + << index->id << ',' << ib::hex(trx_id) << ": " + << rec_printer(entry).str()); + + mtr_start(&mtr); + index->set_modified(mtr); + cursor.page_cur.index = index; + if (has_index_lock) { + mtr_x_lock_index(index, &mtr); + } + + /* We perform the pessimistic variant of the operations if we + already hold index->lock exclusively. First, search the + record. The operation may already have been performed, + depending on when the row in the clustered index was + scanned. */ + *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock + ? BTR_MODIFY_TREE_ALREADY_LATCHED + : BTR_MODIFY_LEAF, &mtr); + if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { + goto func_exit; + } + + ut_ad(dict_index_get_n_unique(index) > 0); + /* This test is somewhat similar to row_ins_must_modify_rec(), + but not identical for unique secondary indexes. */ + if (cursor.low_match >= dict_index_get_n_unique(index) + && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) { + /* We have a matching record. 
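For a unique secondary index the match may cover only the unique key columns; the 'exists' flag below distinguishes a full-field match (the exact record is present) from a match on the unique prefix alone.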
*/ + bool exists = (cursor.low_match + == dict_index_get_n_fields(index)); +#ifdef UNIV_DEBUG + rec_t* rec = btr_cur_get_rec(&cursor); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); +#endif /* UNIV_DEBUG */ + + ut_ad(exists || dict_index_is_unique(index)); + + switch (op) { + case ROW_OP_DELETE: + if (!exists) { + /* The existing record matches the + unique secondary index key, but the + PRIMARY KEY columns differ. So, this + exact record does not exist. For + example, we could detect a duplicate + key error in some old index before + logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could have + been logged for rolling back + TRX_UNDO_INSERT_REC. */ + goto func_exit; + } + + *error = btr_cur_optimistic_delete( + &cursor, BTR_CREATE_FLAG, &mtr); + + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + index->set_modified(mtr); + *error = cursor.search_leaf(entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, + &mtr); + if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { + goto func_exit; + } + /* No other thread than the current one + is allowed to modify the index tree. + Thus, the record should still exist. */ + ut_ad(cursor.low_match + >= dict_index_get_n_fields(index)); + ut_ad(page_rec_is_user_rec( + btr_cur_get_rec(&cursor))); + } + + /* As there are no externally stored fields in + a secondary index record, the parameter + rollback=false will be ignored. */ + + btr_cur_pessimistic_delete( + error, FALSE, &cursor, + BTR_CREATE_FLAG, false, &mtr); + break; + case ROW_OP_INSERT: + if (exists) { + /* The record already exists. There + is nothing to be inserted. + This could happen when processing + TRX_UNDO_DEL_MARK_REC in statement + rollback: + + UPDATE of PRIMARY KEY can lead to + statement rollback if the updated + value of the PRIMARY KEY already + exists. In this case, the UPDATE would + be mapped to DELETE;INSERT, and we + only wrote undo log for the DELETE + part. The duplicate key error would be + triggered before logging the INSERT + part. + + Theoretically, we could also get a + similar situation when a DELETE operation + is blocked by a FOREIGN KEY constraint. */ + goto func_exit; + } + + if (dtuple_contains_null(entry)) { + /* The UNIQUE KEY columns match, but + there is a NULL value in the key, and + NULL!=NULL. */ + goto insert_the_rec; + } + + goto duplicate; + } + } else { + switch (op) { + rec_t* rec; + big_rec_t* big_rec; + case ROW_OP_DELETE: + /* The record does not exist. For example, we + could detect a duplicate key error in some old + index before logging an ROW_OP_INSERT for our + index. This ROW_OP_DELETE could be logged for + rolling back TRX_UNDO_INSERT_REC. */ + goto func_exit; + case ROW_OP_INSERT: + if (dict_index_is_unique(index) + && (cursor.up_match + >= dict_index_get_n_unique(index) + || cursor.low_match + >= dict_index_get_n_unique(index)) + && (!index->n_nullable + || !dtuple_contains_null(entry))) { +duplicate: + /* Duplicate key */ + ut_ad(dict_index_is_unique(index)); + row_merge_dup_report(dup, entry->fields); + *error = DB_DUPLICATE_KEY; + goto func_exit; + } +insert_the_rec: + /* Insert the record. As we are inserting into + a secondary index, there cannot be externally + stored columns (!big_rec). 
*/ + *error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, 0, NULL, &mtr); + ut_ad(!big_rec); + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + index->set_modified(mtr); + *error = cursor.search_leaf(entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, + &mtr); + if (*error != DB_SUCCESS) { + break; + } + } + + /* We already determined that the + record did not exist. No other thread + than the current one is allowed to + modify the index tree. Thus, the + record should still not exist. */ + + *error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, + 0, NULL, &mtr); + ut_ad(!big_rec); + break; + } + mem_heap_empty(offsets_heap); + } + + if (*error == DB_SUCCESS && trx_id) { + page_update_max_trx_id(btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + +func_exit: + mtr_commit(&mtr); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. +@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const mrec_t* +row_log_apply_op( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap for + allocating data tuples */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + rec_offs* offsets) /*!< in/out: work area for + rec_init_offsets_temp() */ + +{ + enum row_op op; + ulint extra_size; + ulint data_size; + dtuple_t* entry; + trx_id_t trx_id; + + /* Online index creation is only used for secondary indexes. */ + ut_ad(!dict_index_is_clust(index)); + + ut_ad(index->lock.have_x() == has_index_lock); + + if (index->is_corrupted()) { + *error = DB_INDEX_CORRUPT; + return(NULL); + } + + *error = DB_SUCCESS; + + if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) { + return(NULL); + } + + switch (*mrec) { + case ROW_OP_INSERT: + if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) { + return(NULL); + } + + op = static_cast<enum row_op>(*mrec++); + trx_id = trx_read_trx_id(mrec); + mrec += DATA_TRX_ID_LEN; + break; + case ROW_OP_DELETE: + op = static_cast<enum row_op>(*mrec++); + trx_id = 0; + break; + default: +corrupted: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + + extra_size = *mrec++; + + ut_ad(mrec < mrec_end); + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_init_offsets_temp(mrec, index, offsets); + + if (rec_offs_any_extern(offsets)) { + /* There should never be any externally stored fields + in a secondary index, which is what online index + creation is used for. Therefore, the log file must be + corrupted. 
*/ + goto corrupted; + } + + data_size = rec_offs_data_size(offsets); + + mrec += data_size; + + if (mrec > mrec_end) { + return(NULL); + } + + entry = row_rec_to_index_entry_low( + mrec - data_size, index, offsets, heap); + /* Online index creation is only implemented for secondary + indexes, which never contain off-page columns. */ + ut_ad(dtuple_get_n_ext(entry) == 0); + + row_log_apply_op_low(index, dup, error, offsets_heap, + has_index_lock, op, trx_id, entry); + return(mrec); +} + +/** Applies operations to a secondary index that was being created. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index index +@param[in,out] dup for reporting duplicate key errors +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, then stage->inc() will be called for each block +of log that is applied or nullptr when row log applied done by DML +thread. +@return DB_SUCCESS, or error code on failure */ +static +dberr_t +row_log_apply_ops( + const trx_t* trx, + dict_index_t* index, + row_merge_dup_t* dup, + ut_stage_alter_t* stage) +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end= NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + rec_offs* offsets; + bool has_index_lock; + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + ut_ad(!index->is_committed()); + ut_ad(index->lock.have_x()); + ut_ad(index->online_log); + + MEM_UNDEFINED(&mrec_end, sizeof mrec_end); + + offsets = static_cast<rec_offs*>(ut_malloc_nokey(i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + + offsets_heap = mem_heap_create(srv_page_size); + heap = mem_heap_create(srv_page_size); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); + ut_ad(index->lock.have_x()); + ut_ad(index->online_log->head.bytes == 0); + + if (stage) { + stage->inc(row_log_progress_inc_per_block()); + } + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + error = index->online_log->error; + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (index->is_corrupted()) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + ib::error() << "Unexpected end of temporary file for index " + << index->name; +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + if (index->online_log->fd > 0 + && ftruncate(index->online_log->fd, 0) == -1) { + ib::error() + << "\'" << index->name + 1 + << "\' failed with error " + << errno << ":" << strerror(errno); + + goto corruption; + } +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. 
*/ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->tail.bytes = 0; + index->online_log->head.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs = static_cast<os_offset_t>( + index->online_log->head.blocks) + * srv_sort_buf_size; + ut_ad(has_index_lock); + has_index_lock = false; + index->lock.x_unlock(); + + log_free_check(); + + if (!row_log_block_allocate(index->online_log->head)) { + error = DB_OUT_OF_MEMORY; + goto func_exit; + } + + byte* buf = index->online_log->head.block; + + if (DB_SUCCESS + != os_file_read(IORequestRead, index->online_log->fd, + buf, ofs, srv_sort_buf_size, nullptr)) { + ib::error() + << "Unable to read temporary file" + " for index " << index->name; + goto corruption; + } + + if (srv_encrypt_log) { + if (!log_tmp_block_decrypt( + buf, srv_sort_buf_size, + index->online_log->crypt_head, ofs)) { + error = DB_DECRYPTION_FAILED; + goto func_exit; + } + + srv_stats.n_rowlog_blocks_decrypted.inc(); + memcpy(buf, index->online_log->crypt_head, srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + ulint((&index->online_log->head.buf)[1] - mrec_end)); + mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = ulint(mrec - mrec_end); + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. 
*/ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + next_mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + index->lock.x_lock(SRW_LOCK_CALL); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes + += ulint(next_mrec - mrec); + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + ulint(mrec_end - mrec)); + mrec_end += ulint(index->online_log->head.buf - mrec); + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + switch (error) { + case DB_SUCCESS: + break; + case DB_INDEX_CORRUPT: + if (((os_offset_t) index->online_log->tail.blocks + 1) + * srv_sort_buf_size >= srv_online_max_size) { + /* The log file grew too big. 
*/ + error = DB_ONLINE_LOG_TOO_BIG; + } + /* fall through */ + default: + index->type |= DICT_CORRUPT; + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + row_log_block_free(index->online_log->head); + ut_free(offsets); + return(error); +} + +/** Apply the row log to the index upon completing index creation. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index secondary index +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_index() will be called initially and then +stage->inc() will be called for each block of log that is applied or nullptr +when row log has been applied by DML thread. +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_apply( + const trx_t* trx, + dict_index_t* index, + struct TABLE* table, + ut_stage_alter_t* stage) +{ + dberr_t error; + row_merge_dup_t dup = { index, table, NULL, 0 }; + DBUG_ENTER("row_log_apply"); + + ut_ad(dict_index_is_online_ddl(index) + || (index->online_log + && index->online_status == ONLINE_INDEX_COMPLETE)); + ut_ad(!dict_index_is_clust(index)); + + if (stage) { + stage->begin_phase_log_index(); + } + + log_free_check(); + + index->lock.x_lock(SRW_LOCK_CALL); + + if (index->online_log && !index->table->corrupted) { + error = row_log_apply_ops(trx, index, &dup, stage); + } else { + error = DB_SUCCESS; + } + + if (error != DB_SUCCESS) { + ut_ad(index->table->space); + index->type |= DICT_CORRUPT; + index->table->drop_aborted = TRUE; + + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + } else if (stage) { + /* Mark the index as completed only when it is + being called by DDL thread */ + ut_ad(dup.n_dup == 0); + dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE); + } + + index->lock.x_unlock(); + + DBUG_RETURN(error); +} + +unsigned row_log_get_n_core_fields(const dict_index_t *index) +{ + ut_ad(index->online_log); + return index->online_log->n_core_fields; +} + +dberr_t row_log_get_error(const dict_index_t *index) +{ + ut_ad(index->online_log); + return index->online_log->error; +} + +dberr_t dict_table_t::clear(que_thr_t *thr) +{ + dberr_t err= DB_SUCCESS; + for (dict_index_t *index= UT_LIST_GET_FIRST(indexes); index; + index= UT_LIST_GET_NEXT(indexes, index)) + { + if (index->type & DICT_FTS) + continue; + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + ut_ad("invalid type" == 0); + MY_ASSERT_UNREACHABLE(); + break; + } + if (dberr_t err_index= index->clear(thr)) + err= err_index; + } + return err; +} + +inline bool UndorecApplier::is_same(roll_ptr_t roll_ptr) const +{ + return uint16_t(roll_ptr) == offset && + uint32_t(roll_ptr >> 16) == page_id.page_no(); +} + +const rec_t * +UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index, + const rec_t **clust_rec, rec_offs **offsets) +{ + ut_ad(index->is_primary()); + btr_pcur_t pcur; + + bool found= row_search_on_row_ref(&pcur, BTR_MODIFY_LEAF, + index->table, &tuple, &mtr); + ut_a(found); + *clust_rec= btr_pcur_get_rec(&pcur); + + ulint len= 0; + rec_t *prev_version; + const rec_t *version= *clust_rec; + do + { + *offsets= rec_get_offsets(version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, + &heap); + roll_ptr_t roll_ptr= trx_read_roll_ptr( + rec_get_nth_field(version, *offsets, index->db_roll_ptr(), &len)); + 
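/* Stop at the record version whose roll pointer refers to this undo log record; otherwise walk back to the previous version and check again. */ +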
ut_ad(len == DATA_ROLL_PTR_LEN); + if (is_same(roll_ptr)) + return version; + trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version, + nullptr, nullptr, 0); + version= prev_version; + } + while (version); + + return nullptr; +} + +/** Clear out all online log of other online indexes after +encountering the error during row_log_apply() in DML thread +@param table table which does online DDL */ +static void row_log_mark_other_online_index_abort(dict_table_t *table) +{ + dict_index_t *clust_index= dict_table_get_first_index(table); + for (dict_index_t *index= dict_table_get_next_index(clust_index); + index; index= dict_table_get_next_index(index)) + { + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + index->lock.x_lock(SRW_LOCK_CALL); + row_log_abort_sec(index); + index->type|= DICT_CORRUPT; + index->lock.x_unlock(); + MONITOR_ATOMIC_INC(MONITOR_BACKGROUND_DROP_INDEX); + } + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + clust_index->online_log= nullptr; + clust_index->lock.x_unlock(); + table->drop_aborted= TRUE; +} + +void dtype_t::assign(const dict_col_t &col) +{ + prtype= col.prtype; + mtype= col.mtype; + len= col.len; + mbminlen= col.mbminlen; + mbmaxlen= col.mbmaxlen; +} + +inline void dtuple_t::copy_field_types(const dict_index_t &index) +{ + ut_ad(index.n_fields == n_fields); + if (UNIV_LIKELY_NULL(index.change_col_info)) + for (ulint i= 0; i < n_fields; i++) + fields[i].type.assign(*index.fields[i].col); +} + +void UndorecApplier::log_insert(const dtuple_t &tuple, + dict_index_t *clust_index) +{ + DEBUG_SYNC_C("row_log_insert_handle"); + ut_ad(clust_index->is_primary()); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + + rec_offs_init(offsets_); + mtr.start(); + const rec_t *rec; + const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets); + if (!match_rec) + { + mtr.commit(); + return; + } + const rec_t *copy_rec= match_rec; + if (match_rec == rec) + { + copy_rec= rec_copy(mem_heap_alloc( + heap, rec_offs_size(offsets)), match_rec, offsets); + rec_offs_make_valid(copy_rec, clust_index, true, offsets); + } + mtr.commit(); + + dict_table_t *table= clust_index->table; + clust_index->lock.s_lock(SRW_LOCK_CALL); + if (clust_index->online_log && + !clust_index->online_log_is_dummy() && + clust_index->online_status <= ONLINE_INDEX_CREATION) + { + row_log_table_insert(copy_rec, clust_index, offsets); + clust_index->lock.s_unlock(); + } + else + { + clust_index->lock.s_unlock(); + row_ext_t *ext; + dtuple_t *row= row_build(ROW_COPY_POINTERS, clust_index, + copy_rec, offsets, table, nullptr, nullptr, &ext, heap); + + if (table->n_v_cols) + { + /* Update the row with virtual column values present + in the undo log or update vector */ + if (type == TRX_UNDO_UPD_DEL_REC) + row_upd_replace_vcol(row, table, update, false, nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? 
nullptr : undo_rec); + else + trx_undo_read_v_cols(table, undo_rec, row, false); + } + + bool success= true; + for (dict_index_t *index= clust_index; + (index= dict_table_get_next_index(index)) != nullptr; ) + { + index->lock.s_lock(SRW_LOCK_CALL); + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + dtuple_t *entry= row_build_index_entry_low(row, ext, index, + heap, ROW_BUILD_NORMAL); + entry->copy_field_types(*index); + success= row_log_online_op(index, entry, trx_id); + } + + index->lock.s_unlock(); + if (!success) + { + row_log_mark_other_online_index_abort(index->table); + return; + } + } + } +} + +void UndorecApplier::log_update(const dtuple_t &tuple, + dict_index_t *clust_index) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs *prev_offsets= offsets2_; + + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + + dict_table_t *table= clust_index->table; + + clust_index->lock.s_lock(SRW_LOCK_CALL); + bool table_rebuild= + (clust_index->online_log + && !clust_index->online_log_is_dummy() + && clust_index->online_status <= ONLINE_INDEX_CREATION); + clust_index->lock.s_unlock(); + + mtr.start(); + const rec_t *rec; + rec_t *prev_version; + bool is_update= (type == TRX_UNDO_UPD_EXIST_REC); + const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets); + if (!match_rec) + { + mtr.commit(); + return; + } + + if (table_rebuild) + { + const rec_t *copy_rec= match_rec; + if (match_rec == rec) + copy_rec= rec_copy(mem_heap_alloc( + heap, rec_offs_size(offsets)), match_rec, offsets); + trx_undo_prev_version_build(match_rec, clust_index, offsets, heap, + &prev_version, nullptr, nullptr, 0); + + prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec_offs_make_valid(copy_rec, clust_index, true, offsets); + mtr.commit(); + + clust_index->lock.s_lock(SRW_LOCK_CALL); + /* Recheck whether clustered index online log has been cleared */ + if (clust_index->online_log) + { + if (is_update) + { + const dtuple_t *rebuilt_old_pk= row_log_table_get_pk( + prev_version, clust_index, prev_offsets, nullptr, &heap); + row_log_table_update(copy_rec, clust_index, offsets, rebuilt_old_pk); + } + else + row_log_table_delete(prev_version, clust_index, prev_offsets, nullptr); + } + clust_index->lock.s_unlock(); + return; + } + + dtuple_t *row= nullptr; + row_ext_t *new_ext; + if (match_rec != rec) + row= row_build(ROW_COPY_POINTERS, clust_index, match_rec, offsets, + clust_index->table, NULL, NULL, &new_ext, heap); + else + row= row_build(ROW_COPY_DATA, clust_index, rec, offsets, + clust_index->table, NULL, NULL, &new_ext, heap); + mtr.commit(); + row_ext_t *old_ext; + dtuple_t *old_row= nullptr; + if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) + { + for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++) + dfield_get_type( + dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; + } + + if (is_update) + { + old_row= dtuple_copy(row, heap); + row_upd_replace(old_row, &old_ext, clust_index, update, heap); + } + + if (table->n_v_cols) + row_upd_replace_vcol(row, table, update, false, nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? 
nullptr : undo_rec); + + bool success= true; + dict_index_t *index= dict_table_get_next_index(clust_index); + while (index) + { + index->lock.s_lock(SRW_LOCK_CALL); + if (index->online_log && + index->online_status <= ONLINE_INDEX_CREATION && + !index->is_corrupted()) + { + if (is_update) + { + /* Ignore the index if the update doesn't affect the index */ + if (!row_upd_changes_ord_field_binary(index, update, + nullptr, + row, new_ext)) + goto next_index; + dtuple_t *old_entry= row_build_index_entry_low( + old_row, old_ext, index, heap, ROW_BUILD_NORMAL); + + old_entry->copy_field_types(*index); + + success= row_log_online_op(index, old_entry, 0); + + dtuple_t *new_entry= row_build_index_entry_low( + row, new_ext, index, heap, ROW_BUILD_NORMAL); + + new_entry->copy_field_types(*index); + + if (success) + success= row_log_online_op(index, new_entry, trx_id); + } + else + { + dtuple_t *old_entry= row_build_index_entry_low( + row, new_ext, index, heap, ROW_BUILD_NORMAL); + + old_entry->copy_field_types(*index); + + success= row_log_online_op(index, old_entry, 0); + } + } +next_index: + index->lock.s_unlock(); + if (!success) + { + row_log_mark_other_online_index_abort(index->table); + return; + } + index= dict_table_get_next_index(index); + } +} + diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc new file mode 100644 index 00000000..5df93fe6 --- /dev/null +++ b/storage/innobase/row/row0merge.cc @@ -0,0 +1,5406 @@ +/***************************************************************************** + +Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0merge.cc +New index creation routines using a merge sort + +Created 12/4/2005 Jan Lindstrom +Completed by Sunny Bains and Marko Makela +*******************************************************/ +#include <my_global.h> +#include <log.h> +#include <sql_class.h> +#include <math.h> + +#include "row0merge.h" +#include "row0ext.h" +#include "row0log.h" +#include "row0ins.h" +#include "row0row.h" +#include "row0sel.h" +#include "log0crypt.h" +#include "dict0crea.h" +#include "trx0purge.h" +#include "lock0lock.h" +#include "pars0pars.h" +#include "ut0sort.h" +#include "row0ftsort.h" +#include "row0import.h" +#include "row0vers.h" +#include "handler0alter.h" +#include "btr0bulk.h" +#ifdef BTR_CUR_ADAPT +# include "btr0sea.h" +#endif /* BTR_CUR_ADAPT */ +#include "ut0stage.h" +#include "fil0crypt.h" +#include "srv0mon.h" + +/* Ignore posix_fadvise() on those platforms where it does not exist */ +#if defined _WIN32 +# define posix_fadvise(fd, offset, len, advice) /* nothing */ +#endif /* _WIN32 */ + +/* Whether to disable file system cache */ +char srv_disable_sort_file_cache; + +/** Class that caches spatial index row tuples made from a single cluster +index page scan, and then insert into corresponding index tree */ +class spatial_index_info { +public: + /** constructor + @param index spatial index to be created */ + spatial_index_info(dict_index_t *index) : index(index) + { + ut_ad(index->is_spatial()); + } + + /** Caches an index row into index tuple vector + @param[in] row table row + @param[in] ext externally stored column prefixes, or NULL */ + void add(const dtuple_t *row, const row_ext_t *ext, mem_heap_t *heap) + { + dtuple_t *dtuple= row_build_index_entry(row, ext, index, heap); + ut_ad(dtuple); + ut_ad(dtuple->n_fields == index->n_fields); + if (ext) + { + /* Replace any references to ext, because ext will be allocated + from row_heap. 
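dfield_dup() copies such column prefixes into the heap passed to this function, so the cached tuples stay valid after row_heap is reused.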
*/ + for (ulint i= 1; i < dtuple->n_fields; i++) + { + dfield_t &dfield= dtuple->fields[i]; + if (dfield.data >= ext->buf && + dfield.data <= &ext->buf[ext->n_ext * ext->max_len]) + dfield_dup(&dfield, heap); + } + } + m_dtuple_vec.push_back(dtuple); + } + + /** Insert spatial index rows cached in vector into spatial index + @param[in] trx_id transaction id + @param[in] pcur cluster index scanning cursor + @param[in,out] mtr_started whether scan_mtr is active + @param[in,out] heap temporary memory heap + @param[in,out] scan_mtr mini-transaction for pcur + @return DB_SUCCESS if successful, else error number */ + dberr_t insert(trx_id_t trx_id, btr_pcur_t* pcur, + bool& mtr_started, mem_heap_t* heap, mtr_t* scan_mtr) + { + big_rec_t* big_rec; + rec_t* rec; + btr_cur_t ins_cur; + mtr_t mtr; + rtr_info_t rtr_info; + rec_offs* ins_offsets = NULL; + dberr_t error = DB_SUCCESS; + dtuple_t* dtuple; + const ulint flag = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG; + + ut_ad(mtr_started == scan_mtr->is_active()); + + DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush", + log_sys.set_check_flush_or_checkpoint();); + + for (idx_tuple_vec::iterator it = m_dtuple_vec.begin(); + it != m_dtuple_vec.end(); + ++it) { + dtuple = *it; + ut_ad(dtuple); + + if (log_sys.check_flush_or_checkpoint()) { + if (mtr_started) { + if (!btr_pcur_move_to_prev_on_page(pcur)) { + error = DB_CORRUPTION; + break; + } + btr_pcur_store_position(pcur, scan_mtr); + scan_mtr->commit(); + mtr_started = false; + } + + log_free_check(); + } + + mtr.start(); + index->set_modified(mtr); + + ins_cur.page_cur.index = index; + rtr_init_rtr_info(&rtr_info, false, &ins_cur, index, + false); + rtr_info_update_btr(&ins_cur, &rtr_info); + + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_LEAF, &mtr); + + /* It need to update MBR in parent entry, + so change search mode to BTR_MODIFY_TREE */ + if (error == DB_SUCCESS && rtr_info.mbr_adj) { + mtr.commit(); + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, &ins_cur, + index, false); + rtr_info_update_btr(&ins_cur, &rtr_info); + mtr.start(); + index->set_modified(mtr); + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_TREE, &mtr); + } + + if (error == DB_SUCCESS) { + error = btr_cur_optimistic_insert( + flag, &ins_cur, &ins_offsets, + &heap, dtuple, &rec, &big_rec, + 0, NULL, &mtr); + } + + ut_ad(!big_rec); + + if (error == DB_FAIL) { + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + + rtr_clean_rtr_info(&rtr_info, true); + rtr_init_rtr_info(&rtr_info, false, + &ins_cur, index, false); + + rtr_info_update_btr(&ins_cur, &rtr_info); + error = rtr_insert_leaf(&ins_cur, dtuple, + BTR_MODIFY_TREE, &mtr); + + if (error == DB_SUCCESS) { + error = btr_cur_pessimistic_insert( + flag, &ins_cur, &ins_offsets, + &heap, dtuple, &rec, + &big_rec, 0, NULL, &mtr); + } + } + + ut_ad(!big_rec); + + DBUG_EXECUTE_IF( + "row_merge_ins_spatial_fail", + error = DB_FAIL; + ); + + if (error == DB_SUCCESS) { + if (rtr_info.mbr_adj) { + error = rtr_ins_enlarge_mbr( + &ins_cur, &mtr); + } + + if (error == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(&ins_cur), + btr_cur_get_page_zip(&ins_cur), + trx_id, &mtr); + } + } + + mtr.commit(); + + rtr_clean_rtr_info(&rtr_info, true); + } + + m_dtuple_vec.clear(); + + return(error); + } + +private: + /** Cache index rows made from a cluster index scan. 
Usually + for rows on single cluster index page */ + typedef std::vector<dtuple_t*, ut_allocator<dtuple_t*> > idx_tuple_vec; + + /** vector used to cache index rows made from cluster index scan */ + idx_tuple_vec m_dtuple_vec; +public: + /** the index being built */ + dict_index_t*const index; +}; + +/* Maximum pending doc memory limit in bytes for a fts tokenization thread */ +#define FTS_PENDING_DOC_MEMORY_LIMIT 1000000 + +/** Insert sorted data tuples to the index. +@param[in] index index to be inserted +@param[in] old_table old table +@param[in] fd file descriptor +@param[in,out] block file buffer +@param[in] row_buf row_buf the sorted data tuples, +or NULL if fd, block will be used instead +@param[in,out] btr_bulk btr bulk instance +@param[in] table_total_rows total rows of old table +@param[in] pct_progress total progress percent untill now +@param[in] pct_cost current progress percent +@param[in] crypt_block buffer for encryption or NULL +@param[in] space space id +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->begin_phase_insert() will be called initially +and then stage->inc() will be called for each record that is processed. +@param[in] blob_file To read big column field data from + the given blob file. It is + applicable only for bulk insert + operation +@return DB_SUCCESS or error number */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_insert_index_tuples( + dict_index_t* index, + const dict_table_t* old_table, + const pfs_os_file_t& fd, + row_merge_block_t* block, + const row_merge_buf_t* row_buf, + BtrBulk* btr_bulk, + const ib_uint64_t table_total_rows, + double pct_progress, + double pct_cost, + row_merge_block_t* crypt_block, + ulint space, + ut_stage_alter_t* stage= nullptr, + merge_file_t* blob_file= nullptr); + +/** Encode an index record. +@return size of the record */ +static MY_ATTRIBUTE((nonnull)) +ulint +row_merge_buf_encode( +/*=================*/ + byte** b, /*!< in/out: pointer to + current end of output buffer */ + const dict_index_t* index, /*!< in: index */ + const mtuple_t* entry, /*!< in: index fields + of the record to encode */ + ulint n_fields) /*!< in: number of fields + in the entry */ +{ + ulint size; + ulint extra_size; + + size = rec_get_converted_size_temp<false>( + index, entry->fields, n_fields, &extra_size); + ut_ad(size >= extra_size); + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *(*b)++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *(*b)++ = (byte) (extra_size + 1); + } + + rec_convert_dtuple_to_temp<false>(*b + extra_size, index, + entry->fields, n_fields); + + *b += size; + return size; +} + +static MY_ATTRIBUTE((malloc, nonnull)) +row_merge_buf_t* +row_merge_buf_create_low( + row_merge_buf_t *buf, mem_heap_t *heap, dict_index_t *index) +{ + ulint max_tuples = srv_sort_buf_size + / std::max<ulint>(1, dict_index_get_min_size(index)); + ut_ad(max_tuples > 0); + ut_ad(max_tuples <= srv_sort_buf_size); + + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = static_cast<mtuple_t*>( + ut_malloc_nokey(2 * max_tuples * sizeof *buf->tuples)); + buf->tmp_tuples = buf->tuples + max_tuples; + return(buf); +} + +/******************************************************//** +Allocate a sort buffer. 
+@return own: sort buffer */ +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ +{ + row_merge_buf_t* buf; + ulint buf_size; + mem_heap_t* heap; + + buf_size = (sizeof *buf); + + heap = mem_heap_create(buf_size); + + buf = static_cast<row_merge_buf_t*>( + mem_heap_zalloc(heap, buf_size)); + row_merge_buf_create_low(buf, heap, index); + + return(buf); +} + +/******************************************************//** +Empty a sort buffer. +@return sort buffer */ +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ +{ + ulint buf_size = sizeof *buf; + ulint max_tuples = buf->max_tuples; + mem_heap_t* heap = buf->heap; + dict_index_t* index = buf->index; + mtuple_t* tuples = buf->tuples; + + mem_heap_empty(heap); + + buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size)); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = tuples; + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/******************************************************//** +Deallocate a sort buffer. */ +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ +{ + ut_free(buf->tuples); + mem_heap_free(buf->heap); +} + +/** Convert the field data from compact to redundant format. +@param[in] row_field field to copy from +@param[out] field field to copy to +@param[in] len length of the field data +@param[in] zip_size compressed BLOB page size, + zero for uncompressed BLOBs +@param[in,out] heap memory heap where to allocate data when + converting to ROW_FORMAT=REDUNDANT, or NULL + when not to invoke + row_merge_buf_redundant_convert(). */ +static +void +row_merge_buf_redundant_convert( + const dfield_t* row_field, + dfield_t* field, + ulint len, + ulint zip_size, + mem_heap_t* heap) +{ + ut_ad(field->type.mbminlen == 1); + ut_ad(field->type.mbmaxlen > 1); + + byte* buf = (byte*) mem_heap_alloc(heap, len); + ulint field_len = row_field->len; + ut_ad(field_len <= len); + + if (row_field->ext) { + const byte* field_data = static_cast<const byte*>( + dfield_get_data(row_field)); + ulint ext_len; + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(memcmp(field_data + field_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + byte* data = btr_copy_externally_stored_field( + &ext_len, field_data, zip_size, field_len, heap); + + ut_ad(ext_len < len); + + memcpy(buf, data, ext_len); + field_len = ext_len; + } else { + memcpy(buf, row_field->data, field_len); + } + + memset(buf + field_len, 0x20, len - field_len); + + dfield_set_data(field, buf, len); +} + +/** Insert the tuple into bulk buffer insert operation +@param buf merge buffer for the index operation +@param table bulk insert operation for the table +@param row tuple to be inserted +@return number of rows inserted */ +static ulint row_merge_bulk_buf_add(row_merge_buf_t* buf, + const dict_table_t &table, + const dtuple_t &row) +{ + if (buf->n_tuples >= buf->max_tuples) + return 0; + + const dict_index_t *index= buf->index; + ulint n_fields= dict_index_get_n_fields(index); + mtuple_t *entry= &buf->tuples[buf->n_tuples]; + ulint data_size= 0; + ulint extra_size= UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + dfield_t *field= entry->fields= static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); + const dict_field_t *ifield= dict_index_get_nth_field(index, 0); + + for 
(ulint i = 0; i < n_fields; i++, field++, ifield++) + { + dfield_copy(field, &row.fields[i]); + ulint len= dfield_get_len(field); + const dict_col_t* const col= ifield->col; + + if (dfield_is_null(field)) + continue; + + ulint fixed_len= ifield->fixed_len; + + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character sets. */ + if (fixed_len && !index->table->not_redundant() && + col->mbminlen != col->mbmaxlen) + fixed_len= 0; + + if (fixed_len); + else if (len < 128 || (!DATA_BIG_COL(col))) + extra_size++; + else + extra_size += 2; + data_size += len; + } + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* Reserve bytes for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= srv_sort_buf_size) + return 0; + + buf->total_size += data_size; + buf->n_tuples++; + + field= entry->fields; + + do + dfield_dup(field++, buf->heap); + while (--n_fields); + + return 1; +} + +/** Insert a data tuple into a sort buffer. +@param[in,out] buf sort buffer +@param[in] fts_index fts index to be created +@param[in] old_table original table +@param[in] new_table new table +@param[in,out] psort_info parallel sort info +@param[in,out] row table row +@param[in] ext cache of externally stored + column prefixes, or NULL +@param[in] history_fts row is historical in a system-versioned table + on which a FTS_DOC_ID_INDEX(FTS_DOC_ID) exists +@param[in,out] doc_id Doc ID if we are creating + FTS index +@param[in,out] conv_heap memory heap where to allocate data when + converting to ROW_FORMAT=REDUNDANT, or NULL + when not to invoke + row_merge_buf_redundant_convert() +@param[in,out] err set if error occurs +@param[in,out] v_heap heap memory to process data for virtual column +@param[in,out] my_table mysql table object +@param[in] trx transaction object +@param[in] col_collate columns whose collations changed, or nullptr +@return number of rows added, 0 if out of space */ +static +ulint +row_merge_buf_add( + row_merge_buf_t* buf, + dict_index_t* fts_index, + const dict_table_t* old_table, + const dict_table_t* new_table, + fts_psort_t* psort_info, + dtuple_t* row, + const row_ext_t* ext, + const bool history_fts, + doc_id_t* doc_id, + mem_heap_t* conv_heap, + dberr_t* err, + mem_heap_t** v_heap, + TABLE* my_table, + trx_t* trx, + const col_collations* col_collate) +{ + ulint i; + const dict_index_t* index; + mtuple_t* entry; + dfield_t* field; + const dict_field_t* ifield; + ulint n_fields; + ulint data_size; + ulint extra_size; + ulint bucket = 0; + doc_id_t write_doc_id; + ulint n_row_added = 0; + VCOL_STORAGE vcol_storage; + + DBUG_ENTER("row_merge_buf_add"); + + if (buf->n_tuples >= buf->max_tuples) { +error: + n_row_added = 0; + goto end; + } + + DBUG_EXECUTE_IF( + "ib_row_merge_buf_add_two", + if (buf->n_tuples >= 2) DBUG_RETURN(0);); + + UNIV_PREFETCH_R(row->fields); + + /* If we are building FTS index, buf->index points to + the 'fts_sort_idx', and real FTS index is stored in + fts_index */ + index = (buf->index->type & DICT_FTS) ? 
fts_index : buf->index; + + /* create spatial index should not come here */ + ut_ad(!dict_index_is_spatial(index)); + + n_fields = dict_index_get_n_fields(index); + + entry = &buf->tuples[buf->n_tuples]; + field = entry->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); + + data_size = 0; + extra_size = UT_BITS_IN_BYTES(unsigned(index->n_nullable)); + + ifield = dict_index_get_nth_field(index, 0); + + for (i = 0; i < n_fields; i++, field++, ifield++) { + ulint len; + ulint fixed_len; + const dfield_t* row_field; + const dict_col_t* const col = ifield->col; + const dict_v_col_t* const v_col = col->is_virtual() + ? reinterpret_cast<const dict_v_col_t*>(col) + : NULL; + + /* Process the Doc ID column */ + if (!v_col && (history_fts || *doc_id) + && col->ind == index->table->fts->doc_col) { + fts_write_doc_id((byte*) &write_doc_id, *doc_id); + + /* Note: field->data now points to a value on the + stack: &write_doc_id after dfield_set_data(). Because + there is only one doc_id per row, it shouldn't matter. + We allocate a new buffer before we leave the function + later below. */ + + dfield_set_data( + field, &write_doc_id, sizeof(write_doc_id)); + + field->type.mtype = ifield->col->mtype; + field->type.prtype = ifield->col->prtype; + field->type.mbminlen = 0; + field->type.mbmaxlen = 0; + field->type.len = ifield->col->len; + } else { + /* Use callback to get the virtual column value */ + if (v_col) { + dict_index_t* clust_index + = dict_table_get_first_index(new_table); + + if (!vcol_storage.innobase_record && + !innobase_allocate_row_for_vcol( + trx->mysql_thd, clust_index, + v_heap, &my_table, + &vcol_storage)) { + *err = DB_OUT_OF_MEMORY; + goto error; + } + + row_field = innobase_get_computed_value( + row, v_col, clust_index, + v_heap, NULL, ifield, trx->mysql_thd, + my_table, vcol_storage.innobase_record, + old_table, NULL); + + if (row_field == NULL) { + *err = DB_COMPUTE_VALUE_FAILED; + goto error; + } + dfield_copy(field, row_field); + } else { + row_field = dtuple_get_nth_field(row, + col->ind); + dfield_copy(field, row_field); + + /* Copy the column collation to the + tuple field */ + if (col_collate) { + auto it = col_collate->find(col->ind); + if (it != col_collate->end()) { + field->type + .assign(*it->second); + } + } + } + + /* Tokenize and process data for FTS */ + if (!history_fts && (index->type & DICT_FTS)) { + fts_doc_item_t* doc_item; + byte* value; + void* ptr; + const ulint max_trial_count = 10000; + ulint trial_count = 0; + + /* fetch Doc ID if it already exists + in the row, and not supplied by the + caller. Even if the value column is + NULL, we still need to get the Doc + ID so to maintain the correct max + Doc ID */ + if (*doc_id == 0) { + const dfield_t* doc_field; + doc_field = dtuple_get_nth_field( + row, + index->table->fts->doc_col); + *doc_id = (doc_id_t) mach_read_from_8( + static_cast<const byte*>( + dfield_get_data(doc_field))); + + if (*doc_id == 0) { + ib::warn() << "FTS Doc ID is" + " zero. 
Record" + " skipped"; + goto error; + } + } + + if (dfield_is_null(field)) { + n_row_added = 1; + continue; + } + + ptr = ut_malloc_nokey(sizeof(*doc_item) + + field->len); + + doc_item = static_cast<fts_doc_item_t*>(ptr); + value = static_cast<byte*>(ptr) + + sizeof(*doc_item); + memcpy(value, field->data, field->len); + field->data = value; + + doc_item->field = field; + doc_item->doc_id = *doc_id; + + bucket = static_cast<ulint>( + *doc_id % fts_sort_pll_degree); + + /* Add doc item to fts_doc_list */ + mysql_mutex_lock(&psort_info[bucket].mutex); + + if (psort_info[bucket].error == DB_SUCCESS) { + UT_LIST_ADD_LAST( + psort_info[bucket].fts_doc_list, + doc_item); + psort_info[bucket].memory_used += + sizeof(*doc_item) + field->len; + } else { + ut_free(doc_item); + } + + mysql_mutex_unlock(&psort_info[bucket].mutex); + + /* Sleep when memory used exceeds limit*/ + while (psort_info[bucket].memory_used + > FTS_PENDING_DOC_MEMORY_LIMIT + && trial_count++ < max_trial_count) { + std::this_thread::sleep_for( + std::chrono::milliseconds(1)); + } + + n_row_added = 1; + continue; + } + + /* innobase_get_computed_value() sets the + length of the virtual column field. */ + if (v_col == NULL + && field->len != UNIV_SQL_NULL + && col->mtype == DATA_MYSQL + && col->len != field->len) { + if (conv_heap != NULL) { + row_merge_buf_redundant_convert( + row_field, field, col->len, + old_table->space->zip_size(), + conv_heap); + } + } + } + + len = dfield_get_len(field); + + if (dfield_is_null(field)) { + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } else if (!ext) { + } else if (dict_index_is_clust(index)) { + /* Flag externally stored fields. */ + const byte* buf = row_ext_lookup(ext, col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + if (i < dict_index_get_n_unique(index)) { + dfield_set_data(field, buf, len); + } else { + dfield_set_ext(field); + len = dfield_get_len(field); + } + } + } else if (!v_col) { + /* Only non-virtual column are stored externally */ + const byte* buf = row_ext_lookup(ext, col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + dfield_set_data(field, buf, len); + } + } + + /* If a column prefix index, take only the prefix */ + + if (ifield->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + ifield->prefix_len, + len, + static_cast<char*>(dfield_get_data(field))); + dfield_set_len(field, len); + } + + ut_ad(len <= col->len + || DATA_LARGE_MTYPE(col->mtype)); + + fixed_len = ifield->fixed_len; + if (fixed_len && !dict_table_is_comp(index->table) + && col->mbminlen != col->mbmaxlen) { + /* CHAR in ROW_FORMAT=REDUNDANT is always + fixed-length, but in the temporary file it is + variable-length for variable-length character + sets. */ + fixed_len = 0; + } + + if (fixed_len) { +#ifdef UNIV_DEBUG + /* len should be between size calcualted base on + mbmaxlen and mbminlen */ + ut_ad(len <= fixed_len); + ut_ad(!col->mbmaxlen || len >= col->mbminlen + * (fixed_len / col->mbmaxlen)); + + ut_ad(!dfield_is_ext(field)); +#endif /* UNIV_DEBUG */ + } else if (dfield_is_ext(field)) { + extra_size += 2; + } else if (len < 128 + || (!DATA_BIG_COL(col))) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. 
*/ + extra_size += 2; + } + data_size += len; + } + + /* If this is FTS index, we already populated the sort buffer, return + here */ + if (index->type & DICT_FTS) { + goto end; + } + +#ifdef UNIV_DEBUG + { + ulint size; + ulint extra; + + size = rec_get_converted_size_temp<false>( + index, entry->fields, n_fields, &extra); + + ut_ad(data_size + extra_size == size); + ut_ad(extra_size == extra); + } +#endif /* UNIV_DEBUG */ + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* Record size can exceed page size while converting to + redundant row format. But there is assert + ut_ad(size < srv_page_size) in rec_offs_data_size(). + It may hit the assert before attempting to insert the row. */ + if (conv_heap != NULL && data_size > srv_page_size) { + *err = DB_TOO_BIG_RECORD; + } + + ut_ad(data_size < srv_sort_buf_size); + + /* Reserve bytes for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= srv_sort_buf_size) { + goto error; + } + + buf->total_size += data_size; + buf->n_tuples++; + n_row_added++; + + field = entry->fields; + + /* Copy the data fields. */ + + do { + dfield_dup(field++, buf->heap); + } while (--n_fields); + + if (conv_heap != NULL) { + mem_heap_empty(conv_heap); + } + +end: + if (vcol_storage.innobase_record) + innobase_free_row_for_vcol(&vcol_storage); + DBUG_RETURN(n_row_added); +} + +/*************************************************************//** +Report a duplicate key. */ +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ +{ + if (!dup->n_dup++ && dup->table) { + /* Only report the first duplicate record, + but count all duplicate records. */ + innobase_fields_to_mysql(dup->table, dup->index, entry); + } +} + +/*************************************************************//** +Compare two tuples. +@return positive, 0, negative if a is greater, equal, less, than b, +respectively */ +static MY_ATTRIBUTE((warn_unused_result)) +int +row_merge_tuple_cmp( +/*================*/ + const dict_index_t* index, /*< in: index tree */ + ulint n_uniq, /*!< in: number of unique fields */ + ulint n_field,/*!< in: number of fields */ + const mtuple_t& a, /*!< in: first tuple to be compared */ + const mtuple_t& b, /*!< in: second tuple to be compared */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates, + NULL if non-unique index */ +{ + int cmp; + const dfield_t* af = a.fields; + const dfield_t* bf = b.fields; + ulint n = n_uniq; + const dict_field_t* f = index->fields; + + ut_ad(n_uniq > 0); + ut_ad(n_uniq <= n_field); + + /* Compare the fields of the tuples until a difference is + found or we run out of fields to compare. If !cmp at the + end, the tuples are equal. */ + do { + cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending); + } while (!cmp && --n); + + if (cmp) { + return(cmp); + } + + if (dup) { + /* Report a duplicate value error if the tuples are + logically equal. NULL columns are logically inequal, + although they are equal in the sorting order. Find + out if any of the fields are NULL. 
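+	If any of the n_uniq fields is NULL, no duplicate is reported.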
*/ + for (const dfield_t* df = a.fields; df != af; df++) { + if (dfield_is_null(df)) { + goto no_report; + } + } + + row_merge_dup_report(dup, a.fields); + } + +no_report: + /* The n_uniq fields were equal, but we compare all fields so + that we will get the same (internal) order as in the B-tree. */ + for (n = n_field - n_uniq + 1; --n; ) { + cmp = cmp_dfield_dfield(af++, bf++, (f++)->descending); + if (cmp) { + return(cmp); + } + } + + /* This should never be reached, except in a secondary index + when creating a secondary index and a PRIMARY KEY, and there + is a duplicate in the PRIMARY KEY that has not been detected + yet. Internally, an index must never contain duplicates. */ + return(cmp); +} + +/** Wrapper for row_merge_tuple_sort() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param tuples array of tuples that being sorted +@param aux work area, same size as tuples[] +@param low lower bound of the sorting area, inclusive +@param high upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \ + row_merge_tuple_sort(index,n_uniq,n_field,dup, tuples, aux, low, high) +/** Wrapper for row_merge_tuple_cmp() to inject some more context to +UT_SORT_FUNCTION_BODY(). +@param a first tuple to be compared +@param b second tuple to be compared +@return positive, 0, negative, if a is greater, equal, less, than b, +respectively */ +#define row_merge_tuple_cmp_ctx(a,b) \ + row_merge_tuple_cmp(index, n_uniq, n_field, a, b, dup) + +/**********************************************************************//** +Merge sort the tuple buffer in main memory. */ +static +void +row_merge_tuple_sort( +/*=================*/ + const dict_index_t* index, /*!< in: index tree */ + ulint n_uniq, /*!< in: number of unique fields */ + ulint n_field,/*!< in: number of fields */ + row_merge_dup_t* dup, /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + mtuple_t* tuples, /*!< in/out: tuples */ + mtuple_t* aux, /*!< in/out: work area */ + ulint low, /*!< in: lower bound of the + sorting area, inclusive */ + ulint high) /*!< in: upper bound of the + sorting area, exclusive */ +{ + ut_ad(n_field > 0); + ut_ad(n_uniq <= n_field); + + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, + tuples, aux, low, high, row_merge_tuple_cmp_ctx); +} + +/******************************************************//** +Sort a buffer. 
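+All fields are compared, so the resulting order matches the internal order of the index B-tree.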
*/ +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ +{ + ut_ad(!buf->index->is_spatial()); + row_merge_tuple_sort(buf->index, buf->index->n_uniq, buf->index->n_fields, + dup, buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); +} + +/** Write the blob field data to temporary file and fill the offset, +length in the field data +@param field tuple field +@param blob_file file to store the blob data +@param heap heap to store the blob offset and length +@return DB_SUCCESS if successful */ +static dberr_t row_merge_write_blob_to_tmp_file( + dfield_t *field, merge_file_t *blob_file,mem_heap_t **heap) +{ + if (blob_file->fd == OS_FILE_CLOSED) + { + blob_file->fd= row_merge_file_create_low(nullptr); + if (blob_file->fd == OS_FILE_CLOSED) + return DB_OUT_OF_MEMORY; + } + uint64_t val= blob_file->offset; + uint32_t len= field->len; + dberr_t err= os_file_write( + IORequestWrite, "(bulk insert)", blob_file->fd, + field->data, blob_file->offset, len); + + if (err != DB_SUCCESS) + return err; + + byte *data= static_cast<byte*> + (mem_heap_alloc(*heap, BTR_EXTERN_FIELD_REF_SIZE)); + + /* Write zeroes for first 8 bytes */ + memset(data, 0, 8); + /* Write offset for next 8 bytes */ + mach_write_to_8(data + 8, val); + /* Write length of the blob in 4 bytes */ + mach_write_to_4(data + 16, len); + blob_file->offset+= field->len; + blob_file->n_rec++; + dfield_set_data(field, data, BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(field); + return err; +} + +/** This function is invoked when tuple size is greater than +innodb_sort_buffer_size. Basically it recreates the tuple +by writing the blob field to the temporary file. +@param entry index fields to be encode the blob +@param blob_file file to store the blob data +@param heap heap to store the blob offset and blob length +@return tuple which fits into sort_buffer_size */ +static dtuple_t* row_merge_buf_large_tuple(const dtuple_t &entry, + merge_file_t *blob_file, + mem_heap_t **heap) +{ + if (!*heap) + *heap= mem_heap_create(DTUPLE_EST_ALLOC(entry.n_fields)); + + dtuple_t *tuple= dtuple_copy(&entry, *heap); + for (ulint i= 0; i < tuple->n_fields; i++) + { + dfield_t *field= &tuple->fields[i]; + if (dfield_is_null(field) || field->len <= 2000) + continue; + + dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap); + if (err != DB_SUCCESS) + return nullptr; + } + + return tuple; +} + + +/** Write the field data whose length is more than 2000 bytes +into blob temporary file and write offset, length into the +tuple field +@param entry index fields to be encode the blob +@param n_fields number of fields in the entry +@param heap heap to store the blob offset and blob length +@param blob_file file to store the blob data */ +static dberr_t row_merge_buf_blob(const mtuple_t *entry, ulint n_fields, + mem_heap_t **heap, merge_file_t *blob_file) +{ + + if (!*heap) + *heap= mem_heap_create(100); + + for (ulint i= 0; i < n_fields; i++) + { + dfield_t *field= &entry->fields[i]; + if (dfield_is_null(field) || field->len <= 2000) + continue; + + dberr_t err= row_merge_write_blob_to_tmp_file(field, blob_file, heap); + if (err != DB_SUCCESS) + return err; + } + + return DB_SUCCESS; +} + +/** Write a buffer to a block. 
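+Each tuple is encoded by row_merge_buf_encode(): extra_size + 1 is written in one byte if it is less than 0x80, or in two bytes otherwise (for example, extra_size 0x7e becomes the byte 0x7f, and 0x123 becomes the bytes 0x81 0x24), followed by the record in temporary format. The chunk is terminated by a single 0 byte.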
+@param buf sorted buffer +@param block buffer for writing to file +@param blob_file blob file handle for doing bulk insert operation */ +dberr_t row_merge_buf_write(const row_merge_buf_t *buf, +#ifndef DBUG_OFF + const merge_file_t *of, /*!< output file */ +#endif + row_merge_block_t *block, + merge_file_t *blob_file) +{ + const dict_index_t* index = buf->index; + ulint n_fields= dict_index_get_n_fields(index); + byte* b = &block[0]; + mem_heap_t* blob_heap = nullptr; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("row_merge_buf_write"); + + for (ulint i = 0; i < buf->n_tuples; i++) { + const mtuple_t* entry = &buf->tuples[i]; + + if (blob_file) { + ut_ad(buf->index->is_primary()); + err = row_merge_buf_blob( + entry, n_fields, &blob_heap, blob_file); + if (err != DB_SUCCESS) { + goto func_exit; + } + } + + ulint rec_size= row_merge_buf_encode( + &b, index, entry, n_fields); + if (blob_file && rec_size > srv_page_size) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + ut_ad(b < &block[srv_sort_buf_size]); + + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' + << of->fd << ',' << of->offset << ' ' << + i << ": " << + rec_printer(entry->fields, n_fields).str()); + } + + /* Write an "end-of-chunk" marker. */ + ut_a(b < &block[srv_sort_buf_size]); + ut_a(b == &block[0] + buf->total_size || blob_file); + *b++ = 0; +#ifdef HAVE_valgrind + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, &block[srv_sort_buf_size] - b); +#endif /* HAVE_valgrind */ + DBUG_LOG("ib_merge_sort", + "write " << reinterpret_cast<const void*>(b) << ',' + << of->fd << ',' << of->offset << " EOF"); +func_exit: + if (blob_heap) { + mem_heap_free(blob_heap); + } + + DBUG_RETURN(err); +} + +/******************************************************//** +Create a memory heap and allocate space for row_merge_rec_offsets() +and mrec_buf_t[3]. +@return memory heap */ +static +mem_heap_t* +row_merge_heap_create( +/*==================*/ + const dict_index_t* index, /*!< in: record descriptor */ + mrec_buf_t** buf, /*!< out: 3 buffers */ + rec_offs** offsets1, /*!< out: offsets */ + rec_offs** offsets2) /*!< out: offsets */ +{ + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1 + + 3 * sizeof **buf); + + *buf = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, 3 * sizeof **buf)); + *offsets1 = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof **offsets1)); + *offsets2 = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof **offsets2)); + + rec_offs_set_n_alloc(*offsets1, i); + rec_offs_set_n_alloc(*offsets2, i); + rec_offs_set_n_fields(*offsets1, dict_index_get_n_fields(index)); + rec_offs_set_n_fields(*offsets2, dict_index_get_n_fields(index)); + + return(heap); +} + +/** Read a merge block from the file system. 
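+If srv_encrypt_log is set, the block is decrypted with log_tmp_block_decrypt() into crypt_buf and then copied back to buf.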
+@return whether the request was completed successfully */ +bool +row_merge_read( +/*===========*/ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to read + in number of row_merge_block_t + elements */ + row_merge_block_t* buf, /*!< out: data */ + row_merge_block_t* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size; + + DBUG_ENTER("row_merge_read"); + DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs); + DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE);); + + const dberr_t err = os_file_read( + IORequestRead, fd, buf, ofs, srv_sort_buf_size, nullptr); + + /* If encryption is enabled decrypt buffer */ + if (err == DB_SUCCESS && srv_encrypt_log) { + if (!log_tmp_block_decrypt(buf, srv_sort_buf_size, + crypt_buf, ofs)) { + DBUG_RETURN(false); + } + + srv_stats.n_merge_blocks_decrypted.inc(); + memcpy(buf, crypt_buf, srv_sort_buf_size); + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + DBUG_RETURN(err == DB_SUCCESS); +} + +/********************************************************************//** +Write a merge block to the file system. +@return whether the request was completed successfully +@retval false on error +@retval true on success */ +bool +row_merge_write( + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to write, + in number of row_merge_block_t elements */ + const void* buf, /*!< in: data */ + void* crypt_buf, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + size_t buf_len = srv_sort_buf_size; + os_offset_t ofs = buf_len * (os_offset_t) offset; + void* out_buf = (void *)buf; + + DBUG_ENTER("row_merge_write"); + DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs); + DBUG_EXECUTE_IF("row_merge_write_failure", DBUG_RETURN(FALSE);); + + /* For encrypted tables, encrypt data before writing */ + if (srv_encrypt_log) { + if (!log_tmp_block_encrypt(static_cast<const byte*>(buf), + buf_len, + static_cast<byte*>(crypt_buf), + ofs)) { + DBUG_RETURN(false); + } + + srv_stats.n_merge_blocks_encrypted.inc(); + out_buf = crypt_buf; + } + + const bool success = DB_SUCCESS == os_file_write( + IORequestWrite, "(merge)", fd, out_buf, ofs, buf_len); + +#ifdef POSIX_FADV_DONTNEED + /* The block will be needed on the next merge pass, + but it can be evicted from the file cache meanwhile. */ + posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + DBUG_RETURN(success); +} + +/********************************************************************//** +Read a merge record. 
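+A record may span two blocks; in that case the next block is read and the record is assembled contiguously in the auxiliary buffer buf.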
+@return pointer to next record, or NULL on I/O error or end of list */ +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + rec_offs* offsets,/*!< out: offsets of mrec */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ulint extra_size; + ulint data_size; + ulint avail_size; + + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + + ut_ad(rec_offs_get_n_alloc(offsets) == 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index)); + + DBUG_ENTER("row_merge_read_rec"); + + extra_size = *b++; + + if (UNIV_UNLIKELY(!extra_size)) { + /* End of list */ + *mrec = NULL; + DBUG_LOG("ib_merge_sort", + "read " << reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) << ',' << + fd << ',' << *foffs << " EOF"); + DBUG_RETURN(NULL); + } + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) { + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { +err_exit: + /* Signal I/O error. */ + *mrec = b; + DBUG_RETURN(NULL); + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + } + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *b++; + } + + /* Normalize extra_size. Above, value 0 signals "end of list". */ + extra_size--; + + /* Read the extra bytes. */ + + if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. Copy the entire record + to the auxiliary buffer and handle this as a special + case. */ + + avail_size = ulint(&block[srv_sort_buf_size] - b); + ut_ad(avail_size < sizeof *buf); + memcpy(*buf, b, avail_size); + + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the record. */ + memcpy(*buf + avail_size, b, extra_size - avail_size); + b += extra_size - avail_size; + + *mrec = *buf + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + + /* These overflows should be impossible given that + records are much smaller than either buffer, and + the record starts near the beginning of each buffer. */ + ut_a(extra_size + data_size < sizeof *buf); + ut_a(b + data_size < &block[srv_sort_buf_size]); + + /* Copy the data bytes. */ + memcpy(*buf + extra_size, b, data_size); + b += data_size; + + goto func_exit; + } + + *mrec = b + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + data_size = rec_offs_data_size(offsets); + ut_ad(extra_size + data_size < sizeof *buf); + + b += extra_size + data_size; + + if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) { + /* The record fits entirely in the block. + This is the normal case. */ + goto func_exit; + } + + /* The record spans two blocks. Copy it to buf. 
*/ + + b -= extra_size + data_size; + avail_size = ulint(&block[srv_sort_buf_size] - b); + memcpy(*buf, b, avail_size); + *mrec = *buf + extra_size; + + rec_init_offsets_temp(*mrec, index, offsets); + + if (!row_merge_read(fd, ++(*foffs), block, + crypt_block, + space)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = &block[0]; + + /* Copy the rest of the record. */ + memcpy(*buf + avail_size, b, extra_size + data_size - avail_size); + b += extra_size + data_size - avail_size; + +func_exit: + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) + << ",fd=" << fd << ',' << *foffs << ": " + << rec_printer(*mrec, 0, offsets).str()); + DBUG_RETURN(b); +} + +/********************************************************************//** +Write a merge record. */ +static +void +row_merge_write_rec_low( +/*====================*/ + byte* b, /*!< out: buffer */ + ulint e, /*!< in: encoded extra_size */ +#ifndef DBUG_OFF + ulint size, /*!< in: total size to write */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint foffs, /*!< in: file offset */ +#endif /* !DBUG_OFF */ + const mrec_t* mrec, /*!< in: record to write */ + const rec_offs* offsets)/*!< in: offsets of mrec */ +#ifdef DBUG_OFF +# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \ + row_merge_write_rec_low(b, e, mrec, offsets) +#endif /* DBUG_OFF */ +{ + DBUG_ENTER("row_merge_write_rec_low"); + +#ifndef DBUG_OFF + const byte* const end = b + size; +#endif /* DBUG_OFF */ + DBUG_ASSERT(e == rec_offs_extra_size(offsets) + 1); + + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ",fd=" << fd << ',' + << foffs << ": " << rec_printer(mrec, 0, offsets).str()); + + if (e < 0x80) { + *b++ = (byte) e; + } else { + *b++ = (byte) (0x80 | (e >> 8)); + *b++ = (byte) e; + } + + memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); + DBUG_SLOW_ASSERT(b + rec_offs_size(offsets) == end); + DBUG_VOID_RETURN; +} + +/********************************************************************//** +Write a merge record. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_rec( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + byte* b, /*!< in: pointer to end of block */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t* mrec, /*!< in: record to write */ + const rec_offs* offsets,/*!< in: offsets of mrec */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ulint extra_size; + ulint size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(mrec); + ut_ad(foffs); + ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]); + ut_ad(mrec < buf[0] || mrec > buf[1]); + + /* Normalize extra_size. Value 0 signals "end of list". */ + extra_size = rec_offs_extra_size(offsets) + 1; + + size = extra_size + (extra_size >= 0x80) + + rec_offs_data_size(offsets); + + if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) { + /* The record spans two blocks. + Copy it to the temporary buffer first. 
*/ + avail_size = ulint(&block[srv_sort_buf_size] - b); + + row_merge_write_rec_low(buf[0], + extra_size, size, fd, *foffs, + mrec, offsets); + + /* Copy the head of the temporary buffer, write + the completed block, and copy the tail of the + record to the head of the new block. */ + memcpy(b, buf[0], avail_size); + + if (!row_merge_write(fd, (*foffs)++, block, + crypt_block, + space)) { + return(NULL); + } + + MEM_UNDEFINED(&block[0], srv_sort_buf_size); + + /* Copy the rest. */ + b = &block[0]; + memcpy(b, buf[0] + avail_size, size - avail_size); + b += size - avail_size; + } else { + row_merge_write_rec_low(b, extra_size, size, fd, *foffs, + mrec, offsets); + b += size; + } + + return(b); +} + +/********************************************************************//** +Write an end-of-list marker. +@return pointer to end of block, or NULL on error */ +static +byte* +row_merge_write_eof( +/*================*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + byte* b, /*!< in: pointer to end of block */ + const pfs_os_file_t& fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space) /*!< in: space id */ +{ + ut_ad(block); + ut_ad(b >= &block[0]); + ut_ad(b < &block[srv_sort_buf_size]); + ut_ad(foffs); + + DBUG_ENTER("row_merge_write_eof"); + DBUG_LOG("ib_merge_sort", + reinterpret_cast<const void*>(b) << ',' << + reinterpret_cast<const void*>(block) << + ",fd=" << fd << ',' << *foffs); + + *b++ = 0; + MEM_CHECK_DEFINED(&block[0], b - &block[0]); + MEM_CHECK_ADDRESSABLE(&block[0], srv_sort_buf_size); + + /* The rest of the block is uninitialized. Silence warnings. */ + MEM_MAKE_DEFINED(b, &block[srv_sort_buf_size] - b); + + if (!row_merge_write(fd, (*foffs)++, block, crypt_block, space)) { + DBUG_RETURN(NULL); + } + + MEM_UNDEFINED(&block[0], srv_sort_buf_size); + DBUG_RETURN(&block[0]); +} + +/** Create a temporary file if it has not been created already. +@param[in,out] tmpfd temporary file handle +@param[in] path location for creating temporary file +@return true on success, false on error */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_merge_tmpfile_if_needed( + pfs_os_file_t* tmpfd, + const char* path) +{ + if (*tmpfd == OS_FILE_CLOSED) { + *tmpfd = row_merge_file_create_low(path); + if (*tmpfd != OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); + } + } + + return(*tmpfd != OS_FILE_CLOSED); +} + +/** Create a temporary file for merge sort if it was not created already. +@param[in,out] file merge file structure +@param[in] nrec number of records in the file +@param[in] path location for creating temporary file +@return true on success, false on error */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +row_merge_file_create_if_needed( + merge_file_t* file, + pfs_os_file_t* tmpfd, + ulint nrec, + const char* path) +{ + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + if (file->fd == OS_FILE_CLOSED && row_merge_file_create(file, path)!= OS_FILE_CLOSED) { + MONITOR_ATOMIC_INC(MONITOR_ALTER_TABLE_SORT_FILES); + if (!row_merge_tmpfile_if_needed(tmpfd, path) ) { + return(false); + } + + file->n_rec = nrec; + } + + ut_ad(file->fd == OS_FILE_CLOSED || *tmpfd != OS_FILE_CLOSED); + return(file->fd != OS_FILE_CLOSED); +} + +/** Copy the merge data tuple from another merge data tuple. 
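+Only the first n_unique fields are copied; their data is duplicated into heap so that the copy stays valid after the source buffer is emptied.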
+@param[in] mtuple source merge data tuple +@param[in,out] prev_mtuple destination merge data tuple +@param[in] n_unique number of unique fields exist in the mtuple +@param[in,out] heap memory heap where last_mtuple allocated */ +static +void +row_mtuple_create( + const mtuple_t* mtuple, + mtuple_t* prev_mtuple, + ulint n_unique, + mem_heap_t* heap) +{ + memcpy(prev_mtuple->fields, mtuple->fields, + n_unique * sizeof *mtuple->fields); + + dfield_t* field = prev_mtuple->fields; + + for (ulint i = 0; i < n_unique; i++) { + dfield_dup(field++, heap); + } +} + +/** Compare two merge data tuples. +@param[in] prev_mtuple merge data tuple +@param[in] current_mtuple merge data tuple +@param[in,out] dup reporter of duplicates +@retval positive, 0, negative if current_mtuple is greater, equal, less, than +last_mtuple. */ +static +int +row_mtuple_cmp( + const mtuple_t* prev_mtuple, + const mtuple_t* current_mtuple, + row_merge_dup_t* dup) +{ + ut_ad(dup->index->is_primary()); + const ulint n_uniq= dup->index->n_uniq; + return row_merge_tuple_cmp(dup->index, n_uniq, n_uniq, + *current_mtuple, *prev_mtuple, dup); +} + +/** Insert cached spatial index rows. +@param[in] trx_id transaction id +@param[in] sp_tuples cached spatial rows +@param[in] num_spatial number of spatial indexes +@param[in,out] heap temporary memory heap +@param[in,out] pcur cluster index cursor +@param[in,out] started whether mtr is active +@param[in,out] mtr mini-transaction +@return DB_SUCCESS or error number */ +static +dberr_t +row_merge_spatial_rows( + trx_id_t trx_id, + spatial_index_info** sp_tuples, + ulint num_spatial, + mem_heap_t* heap, + btr_pcur_t* pcur, + bool& started, + mtr_t* mtr) +{ + if (!sp_tuples) + return DB_SUCCESS; + + for (ulint j= 0; j < num_spatial; j++) + if (dberr_t err= sp_tuples[j]->insert(trx_id, pcur, started, heap, mtr)) + return err; + + mem_heap_empty(heap); + return DB_SUCCESS; +} + +/** Check if the geometry field is valid. +@param[in] row the row +@param[in] index spatial index +@return true if it's valid, false if it's invalid. */ +static +bool +row_geo_field_is_valid( + const dtuple_t* row, + dict_index_t* index) +{ + const dict_field_t* ind_field + = dict_index_get_nth_field(index, 0); + const dict_col_t* col + = ind_field->col; + ulint col_no + = dict_col_get_no(col); + const dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + if (dfield_is_null(dfield) + || dfield_get_len(dfield) < GEO_DATA_HEADER_SIZE) { + return(false); + } + + return(true); +} + +/** Reads clustered index of the table and create temporary files +containing the index entries for the indexes to be built. 
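+Entries for all the indexes are built in a single pass over the clustered index; each full sort buffer is sorted and written to the temporary files, or inserted directly into the index when sorting can be skipped.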
+@param[in] trx transaction +@param[in,out] table MySQL table object, for reporting erroneous + records +@param[in] old_table table where rows are read from +@param[in] new_table table where indexes are created; identical to + old_table unless creating a PRIMARY KEY +@param[in] online true if creating indexes online +@param[in] index indexes to be created +@param[in] fts_sort_idx full-text index to be created, or NULL +@param[in] psort_info parallel sort info for fts_sort_idx creation, + or NULL +@param[in] files temporary files +@param[in] key_numbers MySQL key numbers to create +@param[in] n_index number of indexes to create +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v newly added virtual columns along with indexes +@param[in] col_map mapping of old column numbers to new ones, or +NULL if old_table == new_table +@param[in] add_autoinc number of added AUTO_INCREMENT columns, or +ULINT_UNDEFINED if none is added +@param[in,out] sequence autoinc sequence +@param[in,out] block file buffer +@param[in] skip_pk_sort whether the new PRIMARY KEY will follow +existing order +@param[in,out] tmpfd temporary file handle +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->n_pk_recs_inc() will be called for each record read and +stage->inc() will be called for each page read. +@param[in] pct_cost percent of task weight out of total alter job +@param[in,out] crypt_block crypted file buffer +@param[in] eval_table mysql table used to evaluate virtual column + value, see innobase_get_computed_value(). +@param[in] allow_not_null allow null to not-null conversion +@param[in] col_collate columns whose collations changed, or nullptr +@return DB_SUCCESS or error */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_read_clustered_index( + trx_t* trx, + struct TABLE* table, + const dict_table_t* old_table, + dict_table_t* new_table, + bool online, + dict_index_t** index, + dict_index_t* fts_sort_idx, + fts_psort_t* psort_info, + merge_file_t* files, + const ulint* key_numbers, + ulint n_index, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + ulint add_autoinc, + ib_sequence_t& sequence, + row_merge_block_t* block, + bool skip_pk_sort, + pfs_os_file_t* tmpfd, + ut_stage_alter_t* stage, + double pct_cost, + row_merge_block_t* crypt_block, + struct TABLE* eval_table, + bool allow_not_null, + const col_collations* col_collate) +{ + dict_index_t* clust_index; /* Clustered index */ + mem_heap_t* row_heap = NULL;/* Heap memory to create + clustered index tuples */ + row_merge_buf_t** merge_buf; /* Temporary list for records*/ + mem_heap_t* v_heap = NULL; /* Heap memory to process large + data for virtual column */ + btr_pcur_t pcur; /* Cursor on the clustered + index */ + mtr_t mtr; /* Mini transaction */ + bool mtr_started = false; + dberr_t err = DB_SUCCESS;/* Return code */ + ulint n_nonnull = 0; /* number of columns + changed to NOT NULL */ + ulint* nonnull = NULL; /* NOT NULL columns */ + dict_index_t* fts_index = NULL;/* FTS index */ + doc_id_t doc_id = 0; + doc_id_t max_doc_id = 0; + ibool add_doc_id = FALSE; + pthread_cond_t* fts_parallel_sort_cond = nullptr; + spatial_index_info** sp_tuples = nullptr; + ulint num_spatial = 0; + BtrBulk* clust_btr_bulk = NULL; + bool clust_temp_file = false; + mem_heap_t* mtuple_heap = NULL; + mtuple_t prev_mtuple; + mem_heap_t* conv_heap = NULL; + double curr_progress = 0.0; + ib_uint64_t read_rows = 0; + ib_uint64_t table_total_rows = 0; + char 
new_sys_trx_start[8]; + char new_sys_trx_end[8]; + byte any_autoinc_data[8] = {0}; + bool vers_update_trt = false; + + DBUG_ENTER("row_merge_read_clustered_index"); + + ut_ad((old_table == new_table) == !col_map); + ut_ad(!defaults || col_map); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->id); + + table_total_rows = dict_table_get_n_rows(old_table); + if(table_total_rows == 0) { + /* We don't know total row count */ + table_total_rows = 1; + } + + trx->op_info = "reading clustered index"; + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n"); +#endif + + /* Create and initialize memory for record buffers */ + + merge_buf = static_cast<row_merge_buf_t**>( + ut_malloc_nokey(n_index * sizeof *merge_buf)); + + row_merge_dup_t clust_dup = {index[0], table, col_map, 0}; + dfield_t* prev_fields = nullptr; + const ulint n_uniq = dict_index_get_n_unique(index[0]); + + ut_ad(trx->mysql_thd != NULL); + + const char* path = thd_innodb_tmpdir(trx->mysql_thd); + + ut_ad(!skip_pk_sort || dict_index_is_clust(index[0])); + /* There is no previous tuple yet. */ + prev_mtuple.fields = NULL; + + for (ulint i = 0; i < n_index; i++) { + if (index[i]->type & DICT_FTS) { + + /* We are building a FT index, make sure + we have the temporary 'fts_sort_idx' */ + ut_a(fts_sort_idx); + + fts_index = index[i]; + + merge_buf[i] = row_merge_buf_create(fts_sort_idx); + + add_doc_id = DICT_TF2_FLAG_IS_SET( + new_table, DICT_TF2_FTS_ADD_DOC_ID); + + /* If Doc ID does not exist in the table itself, + fetch the first FTS Doc ID */ + if (add_doc_id) { + fts_get_next_doc_id( + (dict_table_t*) new_table, + &doc_id); + ut_ad(doc_id > 0); + } + + row_fts_start_psort(psort_info); + fts_parallel_sort_cond = + &psort_info[0].psort_common->sort_cond; + } else { + if (dict_index_is_spatial(index[i])) { + num_spatial++; + } + + merge_buf[i] = row_merge_buf_create(index[i]); + } + } + + if (num_spatial > 0) { + ulint count = 0; + + sp_tuples = static_cast<spatial_index_info**>( + ut_malloc_nokey(num_spatial + * sizeof(*sp_tuples))); + + for (ulint i = 0; i < n_index; i++) { + if (dict_index_is_spatial(index[i])) { + sp_tuples[count] + = UT_NEW_NOKEY( + spatial_index_info(index[i])); + count++; + } + } + + ut_ad(count == num_spatial); + } + + mtr.start(); + mtr_started = true; + + /* Find the clustered index and create a persistent cursor + based on that. */ + + clust_index = dict_table_get_first_index(old_table); + const ulint old_trx_id_col = ulint(old_table->n_cols) + - (DATA_N_SYS_COLS - DATA_TRX_ID); + ut_ad(old_table->cols[old_trx_id_col].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col].prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(old_table->cols[old_trx_id_col + 1].mtype == DATA_SYS); + ut_ad(old_table->cols[old_trx_id_col + 1].prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + const ulint new_trx_id_col = col_map + ? col_map[old_trx_id_col] : old_trx_id_col; + uint64_t n_rows = 0; + + err = pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr); + if (err != DB_SUCCESS) { +err_exit: + trx->error_key_num = 0; + goto func_exit; + } else { + rec_t* rec = page_rec_get_next(btr_pcur_get_rec(&pcur)); + if (!rec) { +corrupted_metadata: + err = DB_CORRUPTION; + goto err_exit; + } + if (rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG) { + if (!clust_index->is_instant()) { + goto corrupted_metadata; + } + if (page_rec_is_comp(rec) + && rec_get_status(rec) != REC_STATUS_INSTANT) { + goto corrupted_metadata; + } + /* Skip the metadata pseudo-record. 
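+			   (the hidden record, flagged with REC_INFO_MIN_REC_FLAG, that instant ALTER TABLE stores at the start of the clustered index; it is skipped and not copied)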
*/ + btr_pcur_get_page_cur(&pcur)->rec = rec; + } else if (clust_index->is_instant()) { + goto corrupted_metadata; + } + } + + /* Check if the table is supposed to be empty for our read view. + + If we read bulk_trx_id as an older transaction ID, it is not + incorrect to check here whether that transaction should be + visible to us. If bulk_trx_id is not visible to us, the table + must have been empty at an earlier point of time, also in our + read view. + + An INSERT would only update bulk_trx_id in + row_ins_clust_index_entry_low() if the table really was empty + (everything had been purged), when holding a leaf page latch + in the clustered index (actually, the root page is the only + leaf page in that case). + + We are holding a clustered index leaf page latch here. + That will obviously prevent any concurrent INSERT from + updating bulk_trx_id while we read it. */ + if (!online) { + } else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) { + ut_ad(trx->read_view.is_open()); + ut_ad(bulk_trx_id != trx->id); + if (!trx->read_view.changes_visible(bulk_trx_id)) { + goto func_exit; + } + } + + if (old_table != new_table) { + /* The table is being rebuilt. Identify the columns + that were flagged NOT NULL in the new table, so that + we can quickly check that the records in the old table + do not violate the added NOT NULL constraints. */ + + nonnull = static_cast<ulint*>( + ut_malloc_nokey(dict_table_get_n_cols(new_table) + * sizeof *nonnull)); + + for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) { + if (dict_table_get_nth_col(old_table, i)->prtype + & DATA_NOT_NULL) { + continue; + } + + const ulint j = col_map[i]; + + if (j == ULINT_UNDEFINED) { + /* The column was dropped. */ + continue; + } + + if (dict_table_get_nth_col(new_table, j)->prtype + & DATA_NOT_NULL) { + nonnull[n_nonnull++] = j; + } + } + + if (!n_nonnull) { + ut_free(nonnull); + nonnull = NULL; + } + } + + row_heap = mem_heap_create(sizeof(mrec_buf_t)); + + if (dict_table_is_comp(old_table) + && !dict_table_is_comp(new_table)) { + conv_heap = mem_heap_create(sizeof(mrec_buf_t)); + } + + if (skip_pk_sort) { + prev_fields = static_cast<dfield_t*>( + ut_malloc_nokey(n_uniq * sizeof *prev_fields)); + mtuple_heap = mem_heap_create(sizeof(mrec_buf_t)); + } + + mach_write_to_8(new_sys_trx_start, trx->id); + mach_write_to_8(new_sys_trx_end, TRX_ID_MAX); + + /* Scan the clustered index. */ + for (;;) { + /* Do not continue if table pages are still encrypted */ + if (!old_table->is_readable() || !new_table->is_readable()) { + err = DB_DECRYPTION_FAILED; + goto err_exit; + } + + const rec_t* rec; + trx_id_t rec_trx_id; + rec_offs* offsets; + dtuple_t* row; + row_ext_t* ext; + page_cur_t* cur = btr_pcur_get_page_cur(&pcur); + bool history_row, history_fts = false; + + stage->n_pk_recs_inc(); + + if (!page_cur_move_to_next(cur)) { +corrupted_rec: + err = DB_CORRUPTION; + goto err_exit; + } + + if (page_cur_is_after_last(cur)) { + + stage->inc(); + + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + err = DB_INTERRUPTED; + goto err_exit; + } + + if (online && old_table != new_table) { + err = row_log_table_get_error(clust_index); + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* Insert the cached spatial index rows. 
*/ + err = row_merge_spatial_rows( + trx->id, sp_tuples, num_spatial, + row_heap, &pcur, mtr_started, &mtr); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + mem_heap_empty(row_heap); + + if (!mtr_started) { + goto scan_next; + } + + if (clust_index->lock.is_waiting()) { + /* There are waiters on the clustered + index tree lock, likely the purge + thread. Store and restore the cursor + position, and yield so that scanning a + large table will not starve other + threads. */ + + /* Store the cursor position on the last user + record on the page. */ + if (!btr_pcur_move_to_prev_on_page(&pcur)) { + goto corrupted_index; + } + /* Leaf pages must never be empty, unless + this is the only page in the index tree. */ + if (!btr_pcur_is_on_user_rec(&pcur) + && btr_pcur_get_block(&pcur)->page.id() + .page_no() != clust_index->page) { + goto corrupted_index; + } + + btr_pcur_store_position(&pcur, &mtr); + mtr.commit(); + mtr_started = false; + + /* Give the waiters a chance to proceed. */ + std::this_thread::yield(); +scan_next: + ut_ad(!mtr_started); + ut_ad(!mtr.is_active()); + mtr.start(); + mtr_started = true; + /* Restore position on the record, or its + predecessor if the record was purged + meanwhile. */ + if (pcur.restore_position(BTR_SEARCH_LEAF, + &mtr) + == btr_pcur_t::CORRUPTED) { +corrupted_index: + err = DB_CORRUPTION; + goto func_exit; + } + /* Move to the successor of the + original record. */ + if (!btr_pcur_move_to_next_user_rec( + &pcur, &mtr)) { +end_of_index: + row = NULL; + mtr.commit(); + mtr_started = false; + mem_heap_free(row_heap); + row_heap = NULL; + ut_free(nonnull); + nonnull = NULL; + goto write_buffers; + } + } else { + uint32_t next_page_no = btr_page_get_next( + page_cur_get_page(cur)); + + if (next_page_no == FIL_NULL) { + goto end_of_index; + } + + buf_block_t* block = buf_page_get_gen( + page_id_t(old_table->space->id, + next_page_no), + old_table->space->zip_size(), + RW_S_LATCH, nullptr, BUF_GET, &mtr, + &err, false); + if (!block) { + goto err_exit; + } + + page_cur_set_before_first(block, cur); + if (!page_cur_move_to_next(cur) + || page_cur_is_after_last(cur)) { + goto corrupted_rec; + } + + const auto s = mtr.get_savepoint(); + mtr.rollback_to_savepoint(s - 2, s - 1); + } + } else { + mem_heap_empty(row_heap); + } + + rec = page_cur_get_rec(cur); + + if (online) { + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &row_heap); + rec_trx_id = row_get_rec_trx_id(rec, clust_index, + offsets); + + /* Perform a REPEATABLE READ. + + When rebuilding the table online, + row_log_table_apply() must not see a newer + state of the table when applying the log. + This is mainly to prevent false duplicate key + errors, because the log will identify records + by the PRIMARY KEY, and also to prevent unsafe + BLOB access. + + When creating a secondary index online, this + table scan must not see records that have only + been inserted to the clustered index, but have + not been written to the online_log of + index[]. If we performed READ UNCOMMITTED, it + could happen that the ADD INDEX reaches + ONLINE_INDEX_COMPLETE state between the time + the DML thread has updated the clustered index + but has not yet accessed secondary index. 
*/ + ut_ad(trx->read_view.is_open()); + ut_ad(rec_trx_id != trx->id); + + if (!trx->read_view.changes_visible(rec_trx_id)) { + if (rec_trx_id + >= trx->read_view.low_limit_id() + && rec_trx_id + >= trx_sys.get_max_trx_id()) { + goto corrupted_rec; + } + + rec_t* old_vers; + + row_vers_build_for_consistent_read( + rec, &mtr, clust_index, &offsets, + &trx->read_view, &row_heap, + row_heap, &old_vers, NULL); + + if (!old_vers) { + continue; + } + + /* The old version must necessarily be + in the "prehistory", because the + exclusive lock in + ha_innobase::prepare_inplace_alter_table() + forced the completion of any transactions + that accessed this table. */ + ut_ad(row_get_rec_trx_id(old_vers, clust_index, + offsets) < trx->id); + + rec = old_vers; + rec_trx_id = 0; + } + + if (rec_get_deleted_flag( + rec, + dict_table_is_comp(old_table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. + Above, we did reset rec_trx_id = 0 + for rec = old_vers.*/ + ut_ad(rec == page_cur_get_rec(cur) + ? rec_trx_id + : !rec_trx_id); + /* This record was deleted in the latest + committed version, or it was deleted and + then reinserted-by-update before purge + kicked in. Skip it. */ + continue; + } + + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + } else if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_d(rec_trx_id = rec_get_trx_id(rec, clust_index)); + ut_ad(rec_trx_id); + /* This must be a purgeable delete-marked record, + and the transaction that delete-marked the record + must have been committed before this + !online ALTER TABLE transaction. */ + ut_ad(rec_trx_id < trx->id); + /* Skip delete-marked records. + + Skipping delete-marked records will make the + created indexes unuseable for transactions + whose read views were created before the index + creation completed, but an attempt to preserve + the history would make it tricky to detect + duplicate keys. */ + continue; + } else { + offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &row_heap); + /* This is a locking ALTER TABLE. + + If we are not rebuilding the table, the + DB_TRX_ID does not matter, as it is not being + written to any secondary indexes; see + if (old_table == new_table) below. + + If we are rebuilding the table, the + DB_TRX_ID,DB_ROLL_PTR should be reset, because + there will be no history available. */ + ut_ad(rec_get_trx_id(rec, clust_index) < trx->id); + rec_trx_id = 0; + } + + /* When !online, we are holding a lock on old_table, preventing + any inserts that could have written a record 'stub' before + writing out off-page columns. */ + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + + /* Build a row based on the clustered index. 
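+		   The row is converted to the format of new_table, using col_map and defaults for added or reordered columns.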
*/ + + row = row_build_w_add_vcol(ROW_COPY_POINTERS, clust_index, + rec, offsets, new_table, + defaults, add_v, col_map, &ext, + row_heap); + ut_ad(row); + + history_row = new_table->versioned() + && dtuple_get_nth_field(row, new_table->vers_end) + ->vers_history_row(); + history_fts = history_row && new_table->fts; + + for (ulint i = 0; i < n_nonnull; i++) { + dfield_t* field = &row->fields[nonnull[i]]; + + ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); + + if (dfield_is_null(field)) { + + Field* null_field = + table->field[nonnull[i]]; + + null_field->set_warning( + Sql_condition::WARN_LEVEL_WARN, + WARN_DATA_TRUNCATED, 1, + ulong(n_rows + 1)); + + if (!allow_not_null) { + err = DB_INVALID_NULL; + goto err_exit; + } + + const dfield_t& default_field + = defaults->fields[nonnull[i]]; + + *field = default_field; + } + } + + /* Get the next Doc ID */ + if (add_doc_id && !history_fts) { + doc_id++; + } else { + doc_id = 0; + } + + ut_ad(row->fields[new_trx_id_col].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col].type.prtype + == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col].len == DATA_TRX_ID_LEN); + ut_ad(row->fields[new_trx_id_col + 1].type.mtype == DATA_SYS); + ut_ad(row->fields[new_trx_id_col + 1].type.prtype + == (DATA_ROLL_PTR | DATA_NOT_NULL)); + ut_ad(row->fields[new_trx_id_col + 1].len == DATA_ROLL_PTR_LEN); + + if (old_table == new_table) { + /* Do not bother touching DB_TRX_ID,DB_ROLL_PTR + because they are not going to be written into + secondary indexes. */ + } else if (rec_trx_id < trx->id) { + /* Reset the DB_TRX_ID,DB_ROLL_PTR of old rows + for which history is not going to be + available after the rebuild operation. + This essentially mimics row_purge_reset_trx_id(). */ + row->fields[new_trx_id_col].data + = const_cast<byte*>(reset_trx_id); + row->fields[new_trx_id_col + 1].data + = const_cast<byte*>(reset_trx_id + + DATA_TRX_ID_LEN); + } + + if (add_autoinc != ULINT_UNDEFINED) { + + ut_ad(add_autoinc + < dict_table_get_n_user_cols(new_table)); + + dfield_t* dfield = dtuple_get_nth_field(row, + add_autoinc); + + if (new_table->versioned()) { + if (history_row) { + if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) { + err = DB_UNSUPPORTED; + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + old_table->name.m_name); + goto func_exit; + } + dfield_set_null(dfield); + } else { + // set not null + ulint len = dfield_get_type(dfield)->len; + dfield_set_data(dfield, any_autoinc_data, len); + } + } + + if (dfield_is_null(dfield)) { + goto write_buffers; + } + + const dtype_t* dtype = dfield_get_type(dfield); + byte* b = static_cast<byte*>(dfield_get_data(dfield)); + + if (sequence.eof()) { + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_AUTOINC_READ_FAILED, "[NULL]"); + err = DB_ERROR; + goto err_exit; + } + + ulonglong value = sequence++; + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + ibool usign; + ulint len = dfield_get_len(dfield); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_ulonglong(b, value, len, usign); + + break; + } + + case DATA_FLOAT: + mach_float_write( + b, static_cast<float>(value)); + break; + + case DATA_DOUBLE: + mach_double_write( + b, static_cast<double>(value)); + break; + + default: + ut_ad(0); + } + } + + if (old_table->versioned()) { + if (!new_table->versioned() + && clust_index->vers_history_row(rec, offsets)) { + continue; + } + } else if (new_table->versioned()) { + dfield_t* start = + dtuple_get_nth_field(row, new_table->vers_start); + dfield_t* end = + dtuple_get_nth_field(row, 
new_table->vers_end); + dfield_set_data(start, new_sys_trx_start, 8); + dfield_set_data(end, new_sys_trx_end, 8); + vers_update_trt = true; + } + +write_buffers: + /* Build all entries for all the indexes to be created + in a single scan of the clustered index. */ + + n_rows++; + ulint s_idx_cnt = 0; + bool skip_sort = skip_pk_sort + && dict_index_is_clust(merge_buf[0]->index); + + for (ulint k = 0, i = 0; i < n_index; i++, skip_sort = false) { + row_merge_buf_t* buf = merge_buf[i]; + ulint rows_added = 0; + + if (dict_index_is_spatial(buf->index)) { + if (!row) { + continue; + } + + ut_ad(sp_tuples[s_idx_cnt]->index + == buf->index); + + /* If the geometry field is invalid, report + error. */ + if (!row_geo_field_is_valid(row, buf->index)) { + err = DB_CANT_CREATE_GEOMETRY_OBJECT; + break; + } + + sp_tuples[s_idx_cnt]->add(row, ext, buf->heap); + s_idx_cnt++; + + continue; + } + + ut_ad(!row + || !dict_index_is_clust(buf->index) + || trx_id_check(row->fields[new_trx_id_col].data, + trx->id)); + + merge_file_t* file = &files[k++]; + + if (UNIV_LIKELY + (row && (rows_added = row_merge_buf_add( + buf, fts_index, old_table, new_table, + psort_info, row, ext, history_fts, + &doc_id, conv_heap, &err, + &v_heap, eval_table, trx, + col_collate)))) { + + /* If we are creating FTS index, + a single row can generate more + records for tokenized word */ + file->n_rec += rows_added; + + if (err != DB_SUCCESS) { + ut_ad(err == DB_TOO_BIG_RECORD); + break; + } + + if (doc_id > max_doc_id) { + max_doc_id = doc_id; + } + + if (buf->index->type & DICT_FTS) { + /* Check if error occurs in child thread */ + for (ulint j = 0; + j < fts_sort_pll_degree; j++) { + if (psort_info[j].error + != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = i; + break; + } + } + + if (err != DB_SUCCESS) { + break; + } + } + + if (skip_sort) { + ut_ad(buf->n_tuples > 0); + const mtuple_t* curr = + &buf->tuples[buf->n_tuples - 1]; + + ut_ad(i == 0); + ut_ad(dict_index_is_clust(merge_buf[0]->index)); + /* Detect duplicates by comparing the + current record with previous record. + When temp file is not used, records + should be in sorted order. */ + if (prev_mtuple.fields != NULL + && (row_mtuple_cmp( + &prev_mtuple, curr, + &clust_dup) == 0)) { + + err = DB_DUPLICATE_KEY; + trx->error_key_num + = key_numbers[0]; + goto func_exit; + } + + prev_mtuple.fields = curr->fields; + } + + continue; + } + + if (err == DB_COMPUTE_VALUE_FAILED) { + trx->error_key_num = i; + goto func_exit; + } + + if (buf->index->type & DICT_FTS) { + if (!row || !doc_id) { + continue; + } + } + + /* The buffer must be sufficiently large + to hold at least one record. It may only + be empty when we reach the end of the + clustered index. row_merge_buf_add() + must not have been called in this loop. */ + ut_ad(buf->n_tuples || row == NULL); + + /* We have enough data tuples to form a block. + Sort them and write to disk if temp file is used + or insert into index if temp file is not used. */ + ut_ad(old_table == new_table + ? !dict_index_is_clust(buf->index) + : (i == 0) == dict_index_is_clust(buf->index)); + + /* We have enough data tuples to form a block. + Sort them (if !skip_sort) and write to disk. */ + + if (buf->n_tuples) { + if (skip_sort) { + /* Temporary File is not used. + so insert sorted block to the index */ + if (row != NULL) { + /* We have to do insert the + cached spatial index rows, since + after the mtr_commit, the cluster + index page could be updated, then + the data in cached rows become + invalid. 
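+					The cached tuples were built with
+					ROW_COPY_POINTERS and can still point
+					into the clustered index page that this
+					mini-transaction has latched; once the
+					latch is released the page may change,
+					so flush them into the spatial indexes
+					first.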
*/ + err = row_merge_spatial_rows( + trx->id, sp_tuples, + num_spatial, + row_heap, + &pcur, mtr_started, + &mtr); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + /* We are not at the end of + the scan yet. We must + mtr.commit() in order to be + able to call log_free_check() + in row_merge_insert_index_tuples(). + Due to mtr.commit(), the + current row will be invalid, and + we must reread it on the next + loop iteration. */ + if (mtr_started) { + if (!btr_pcur_move_to_prev_on_page(&pcur)) { + err = DB_CORRUPTION; + goto func_exit; + } + btr_pcur_store_position( + &pcur, &mtr); + + mtr.commit(); + mtr_started = false; + } + } + + mem_heap_empty(mtuple_heap); + prev_mtuple.fields = prev_fields; + + row_mtuple_create( + &buf->tuples[buf->n_tuples - 1], + &prev_mtuple, n_uniq, + mtuple_heap); + + if (clust_btr_bulk == NULL) { + clust_btr_bulk = UT_NEW_NOKEY( + BtrBulk(index[i], + trx)); + } else { + clust_btr_bulk->latch(); + } + + err = row_merge_insert_index_tuples( + index[i], old_table, + OS_FILE_CLOSED, NULL, buf, + clust_btr_bulk, + table_total_rows, + curr_progress, + pct_cost, + crypt_block, + new_table->space_id); + + if (row == NULL) { + err = clust_btr_bulk->finish( + err); + UT_DELETE(clust_btr_bulk); + clust_btr_bulk = NULL; + } else { + /* Release latches for possible + log_free_chck in spatial index + build. */ + clust_btr_bulk->release(); + } + + if (err != DB_SUCCESS) { + break; + } + + if (row != NULL) { + /* Restore the cursor on the + previous clustered index record, + and empty the buffer. The next + iteration of the outer loop will + advance the cursor and read the + next record (the one which we + had to ignore due to the buffer + overflow). */ + mtr.start(); + mtr_started = true; + if (pcur.restore_position( + BTR_SEARCH_LEAF, &mtr) + == btr_pcur_t::CORRUPTED) { + goto corrupted_index; + } + buf = row_merge_buf_empty(buf); + merge_buf[i] = buf; + /* Restart the outer loop on the + record. We did not insert it + into any index yet. */ + ut_ad(i == 0); + break; + } + } else if (dict_index_is_unique(buf->index)) { + row_merge_dup_t dup = { + buf->index, table, col_map, 0}; + + row_merge_buf_sort(buf, &dup); + + if (dup.n_dup) { + err = DB_DUPLICATE_KEY; + trx->error_key_num + = key_numbers[i]; + break; + } + } else { + row_merge_buf_sort(buf, NULL); + } + } else if (online && new_table == old_table) { + /* Note the newest transaction that + modified this index when the scan was + completed. We prevent older readers + from accessing this index, to ensure + read consistency. */ + + ut_a(row == NULL); + + dict_index_t* index = buf->index; + index->lock.x_lock(SRW_LOCK_CALL); + ut_a(dict_index_get_online_status(index) + == ONLINE_INDEX_CREATION); + + trx_id_t max_trx_id = row_log_get_max_trx( + index); + + if (max_trx_id > index->trx_id) { + index->trx_id = max_trx_id; + } + + index->lock.x_unlock(); + } + + /* Secondary index and clustered index which is + not in sorted order can use the temporary file. + Fulltext index should not use the temporary file. */ + if (!skip_sort && !(buf->index->type & DICT_FTS)) { + /* In case we can have all rows in sort buffer, + we can insert directly into the index without + temporary file if clustered index does not uses + temporary file. 
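+				That is the case when the scan has ended
+				(row == NULL), this index never spilled into a
+				temporary file (file->fd == OS_FILE_CLOSED) and
+				the clustered index did not use one either
+				(!clust_temp_file); the buffer then already
+				holds every remaining row in sorted order and
+				can be bulk-loaded directly.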
*/ + if (row == NULL && file->fd == OS_FILE_CLOSED + && !clust_temp_file) { + DBUG_EXECUTE_IF( + "row_merge_write_failure", + err = DB_TEMP_FILE_WRITE_FAIL; + trx->error_key_num = i; + goto all_done;); + + DBUG_EXECUTE_IF( + "row_merge_tmpfile_fail", + err = DB_OUT_OF_MEMORY; + trx->error_key_num = i; + goto all_done;); + + BtrBulk btr_bulk(index[i], trx); + + err = row_merge_insert_index_tuples( + index[i], old_table, + OS_FILE_CLOSED, NULL, buf, + &btr_bulk, + table_total_rows, + curr_progress, + pct_cost, + crypt_block, + new_table->space_id); + + err = btr_bulk.finish(err); + + DBUG_EXECUTE_IF( + "row_merge_insert_big_row", + err = DB_TOO_BIG_RECORD;); + + if (err != DB_SUCCESS) { + break; + } + } else { + if (!row_merge_file_create_if_needed( + file, tmpfd, + buf->n_tuples, path)) { + err = DB_OUT_OF_MEMORY; + trx->error_key_num = i; + break; + } + + /* Ensure that duplicates in the + clustered index will be detected before + inserting secondary index records. */ + if (dict_index_is_clust(buf->index)) { + clust_temp_file = true; + } + + ut_ad(file->n_rec > 0); + + row_merge_buf_write(buf, +#ifndef DBUG_OFF + file, +#endif + block); + + if (!row_merge_write( + file->fd, file->offset++, + block, crypt_block, + new_table->space_id)) { + err = DB_TEMP_FILE_WRITE_FAIL; + trx->error_key_num = i; + break; + } + + MEM_UNDEFINED( + &block[0], srv_sort_buf_size); + } + } + merge_buf[i] = row_merge_buf_empty(buf); + buf = merge_buf[i]; + + if (UNIV_LIKELY(row != NULL)) { + /* Try writing the record again, now + that the buffer has been written out + and emptied. */ + + if (UNIV_UNLIKELY + (!(rows_added = row_merge_buf_add( + buf, fts_index, old_table, + new_table, psort_info, + row, ext, history_fts, &doc_id, + conv_heap, &err, &v_heap, + eval_table, trx, col_collate)))) { + /* An empty buffer should have enough + room for at least one record. */ + ut_ad(err == DB_COMPUTE_VALUE_FAILED + || err == DB_OUT_OF_MEMORY + || err == DB_TOO_BIG_RECORD); + } else if (err == DB_SUCCESS) { + file->n_rec += rows_added; + continue; + } + + trx->error_key_num = i; + break; + } + } + + if (row == NULL) { + if (old_table != new_table) { + new_table->stat_n_rows = n_rows; + } + + goto all_done; + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (v_heap) { + mem_heap_empty(v_heap); + } + + /* Increment innodb_onlineddl_pct_progress status variable */ + read_rows++; + if(read_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (read_rows >= table_total_rows) ? 
+ pct_cost : + pct_cost * static_cast<double>(read_rows) + / static_cast<double>(table_total_rows); + /* presenting 10.12% as 1012 integer */ + onlineddl_pct_progress = (ulint) (curr_progress * 100); + } + } + +func_exit: + ut_ad(mtr_started == mtr.is_active()); + if (mtr_started) { + mtr.commit(); + } + if (row_heap) { + mem_heap_free(row_heap); + } + ut_free(nonnull); + +all_done: + if (clust_btr_bulk != NULL) { + ut_ad(err != DB_SUCCESS); + clust_btr_bulk->latch(); + err = clust_btr_bulk->finish( + err); + UT_DELETE(clust_btr_bulk); + } + + if (prev_fields) { + ut_free(prev_fields); + mem_heap_free(mtuple_heap); + } + + if (v_heap) { + mem_heap_free(v_heap); + } + + if (conv_heap != NULL) { + mem_heap_free(conv_heap); + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n"); +#endif + if (UNIV_LIKELY_NULL(fts_parallel_sort_cond)) { +wait_again: + /* Check if error occurs in child thread */ + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + if (psort_info[j].error != DB_SUCCESS) { + err = psort_info[j].error; + trx->error_key_num = j; + break; + } + } + + /* Tell all children that parent has done scanning */ + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (err == DB_SUCCESS) { + psort_info[i].state = FTS_PARENT_COMPLETE; + } else { + psort_info[i].state = FTS_PARENT_EXITING; + } + } + + /* Now wait all children to report back to be completed */ + timespec abstime; + set_timespec(abstime, 1); + mysql_mutex_lock(&psort_info[0].mutex); + my_cond_timedwait(fts_parallel_sort_cond, + &psort_info[0].mutex.m_mutex, &abstime); + mysql_mutex_unlock(&psort_info[0].mutex); + + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (!psort_info[i].child_status) { + goto wait_again; + } + } + + for (ulint j = 0; j < fts_sort_pll_degree; j++) { + psort_info[j].task->wait(); + delete psort_info[j].task; + } + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n"); +#endif + for (ulint i = 0; i < n_index; i++) { + row_merge_buf_free(merge_buf[i]); + } + + row_fts_free_pll_merge_buf(psort_info); + + ut_free(merge_buf); + ut_free(pcur.old_rec_buf); + + if (sp_tuples != NULL) { + for (ulint i = 0; i < num_spatial; i++) { + UT_DELETE(sp_tuples[i]); + } + ut_free(sp_tuples); + } + + /* Update the next Doc ID we used. Table should be locked, so + no concurrent DML */ + if (max_doc_id && err == DB_SUCCESS) { + /* Sync fts cache for other fts indexes to keep all + fts indexes consistent in sync_doc_id. */ + err = fts_sync_table(const_cast<dict_table_t*>(new_table)); + + if (err == DB_SUCCESS) { + new_table->fts->cache->synced_doc_id = max_doc_id; + + /* Update the max value as next FTS_DOC_ID */ + if (max_doc_id >= new_table->fts->cache->next_doc_id) { + new_table->fts->cache->next_doc_id = + max_doc_id + 1; + } + + new_table->fts->cache->first_doc_id = + new_table->fts->cache->next_doc_id; + + err= fts_update_sync_doc_id( + new_table, + new_table->fts->cache->synced_doc_id, + NULL); + } + } + + if (vers_update_trt) { + trx->mod_tables.emplace(new_table, 0) + .first->second.set_versioned(0); + } + + trx->op_info = ""; + + DBUG_RETURN(err); +} + +/** Write a record via buffer 2 and read the next record to buffer N. 
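+For example, ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged) appends the
+current record of input run 0 (mrec0) to the output run and then advances run 0
+to its next record; once run 0 is exhausted, control jumps to the "merged" label.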
+@param N number of the buffer (0 or 1) +@param INDEX record descriptor +@param AT_END statement to execute at end of input */ +#define ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) \ + do { \ + b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \ + &buf[2], b2, \ + of->fd, &of->offset, \ + mrec##N, offsets##N, \ + crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL , \ + space); \ + if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \ + goto corrupt; \ + } \ + b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\ + &buf[N], b##N, INDEX, \ + file->fd, foffs##N, \ + &mrec##N, offsets##N, \ + crypt_block ? &crypt_block[N * srv_sort_buf_size] : NULL, \ + space); \ + \ + if (UNIV_UNLIKELY(!b##N)) { \ + if (mrec##N) { \ + goto corrupt; \ + } \ + AT_END; \ + } \ + } while (0) + +#ifdef HAVE_PSI_STAGE_INTERFACE +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ + do { \ + if (stage != NULL) { \ + stage->inc(); \ + } \ + ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END); \ + } while (0) +#else /* HAVE_PSI_STAGE_INTERFACE */ +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ + ROW_MERGE_WRITE_GET_NEXT_LOW(N, INDEX, AT_END) +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +/** Merge two blocks of records on disk and write a bigger block. +@param[in] dup descriptor of index being created +@param[in] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] foffs0 offset of first source list in the file +@param[in,out] foffs1 offset of second source list in the file +@param[in,out] of output file +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_blocks( + const row_merge_dup_t* dup, + const merge_file_t* file, + row_merge_block_t* block, + ulint* foffs0, + ulint* foffs1, + merge_file_t* of, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), + row_merge_block_t* crypt_block, + ulint space) +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + const byte* b1; /*!< pointer to block[srv_sort_buf_size] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ + const mrec_t* mrec1; /*!< merge rec, points to + block[srv_sort_buf_size] or buf[1] */ + rec_offs* offsets0;/* offsets of mrec0 */ + rec_offs* offsets1;/* offsets of mrec1 */ + + DBUG_ENTER("row_merge_blocks"); + DBUG_LOG("ib_merge_sort", + "fd=" << file->fd << ',' << *foffs0 << '+' << *foffs1 + << " to fd=" << of->fd << ',' << of->offset); + + heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0], + crypt_block ? &crypt_block[0] : NULL, + space) || + !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size], + crypt_block ? 
&crypt_block[srv_sort_buf_size] : NULL, + space)) { +corrupt: + mem_heap_free(heap); + DBUG_RETURN(DB_CORRUPTION); + } + + b0 = &block[0]; + b1 = &block[srv_sort_buf_size]; + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec( + &block[0], &buf[0], b0, dup->index, + file->fd, foffs0, &mrec0, offsets0, + crypt_block ? &crypt_block[0] : NULL, + space); + + b1 = row_merge_read_rec( + &block[srv_sort_buf_size], + &buf[srv_sort_buf_size], b1, dup->index, + file->fd, foffs1, &mrec1, offsets1, + crypt_block ? &crypt_block[srv_sort_buf_size] : NULL, + space); + + if (UNIV_UNLIKELY(!b0 && mrec0) + || UNIV_UNLIKELY(!b1 && mrec1)) { + + goto corrupt; + } + + while (mrec0 && mrec1) { + int cmp = cmp_rec_rec_simple( + mrec0, mrec1, offsets0, offsets1, + dup->index, dup->table); + if (cmp < 0) { + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged); + } else if (cmp) { + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged); + } else { + mem_heap_free(heap); + DBUG_RETURN(DB_DUPLICATE_KEY); + } + } + +merged: + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0); + } + } +done0: + if (mrec1) { + /* append all mrec1 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1); + } + } +done1: + + mem_heap_free(heap); + + b2 = row_merge_write_eof( + &block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset, + crypt_block ? &crypt_block[2 * srv_sort_buf_size] : NULL, + space); + DBUG_RETURN(b2 ? DB_SUCCESS : DB_CORRUPTION); +} + +/** Copy a block of index entries. +@param[in] index index being created +@param[in] file input file +@param[in,out] block 3 buffers +@param[in,out] foffs0 input file offset +@param[in,out] of output file +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +@return TRUE on success, FALSE on failure */ +static MY_ATTRIBUTE((warn_unused_result)) +ibool +row_merge_blocks_copy( + const dict_index_t* index, + const merge_file_t* file, + row_merge_block_t* block, + ulint* foffs0, + merge_file_t* of, + ut_stage_alter_t* stage MY_ATTRIBUTE((unused)), + row_merge_block_t* crypt_block, + ulint space) +{ + mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ + + mrec_buf_t* buf; /*!< buffer for handling + split mrec in block[] */ + const byte* b0; /*!< pointer to block[0] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ + const mrec_t* mrec0; /*!< merge rec, points to block[0] */ + rec_offs* offsets0;/* offsets of mrec0 */ + rec_offs* offsets1;/* dummy offsets */ + + DBUG_ENTER("row_merge_blocks_copy"); + DBUG_LOG("ib_merge_sort", + "fd=" << file->fd << ',' << foffs0 + << " to fd=" << of->fd << ',' << of->offset); + + heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1); + + /* Write a record and read the next record. Split the output + file in two halves, which can be merged on the following pass. */ + + if (!row_merge_read(file->fd, *foffs0, &block[0], + crypt_block ? &crypt_block[0] : NULL, + space)) { +corrupt: + mem_heap_free(heap); + DBUG_RETURN(FALSE); + } + + b0 = &block[0]; + + b2 = &block[2 * srv_sort_buf_size]; + + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, + file->fd, foffs0, &mrec0, offsets0, + crypt_block ? 
&crypt_block[0] : NULL, + space); + + if (UNIV_UNLIKELY(!b0 && mrec0)) { + + goto corrupt; + } + + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0); + } + } +done0: + + /* The file offset points to the beginning of the last page + that has been read. Update it to point to the next block. */ + (*foffs0)++; + + mem_heap_free(heap); + + DBUG_RETURN(row_merge_write_eof( + &block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset, + crypt_block + ? &crypt_block[2 * srv_sort_buf_size] + : NULL, space) + != NULL); +} + +/** Merge disk files. +@param[in] trx transaction +@param[in] dup descriptor of index being created +@param[in,out] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] tmpfd temporary file handle +@param[in,out] num_run Number of runs that remain to be merged +@param[in,out] run_offset Array that contains the first offset number +for each merge run +@param[in,out] stage performance schema accounting object, used by +@param[in,out] crypt_block encryption buffer +@param[in] space tablespace ID for encryption +ALTER TABLE. If not NULL stage->inc() will be called for each record +processed. +@return DB_SUCCESS or error code */ +static +dberr_t +row_merge( + trx_t* trx, + const row_merge_dup_t* dup, + merge_file_t* file, + row_merge_block_t* block, + pfs_os_file_t* tmpfd, + ulint* num_run, + ulint* run_offset, + ut_stage_alter_t* stage, + row_merge_block_t* crypt_block, + ulint space) +{ + ulint foffs0; /*!< first input offset */ + ulint foffs1; /*!< second input offset */ + dberr_t error; /*!< error code */ + merge_file_t of; /*!< output file */ + const ulint ihalf = run_offset[*num_run / 2]; + /*!< half the input file */ + ulint n_run = 0; + /*!< num of runs generated from this merge */ + + MEM_CHECK_ADDRESSABLE(&block[0], 3 * srv_sort_buf_size); + + if (crypt_block) { + MEM_CHECK_ADDRESSABLE(&crypt_block[0], 3 * srv_sort_buf_size); + } + + ut_ad(ihalf < file->offset); + + of.fd = *tmpfd; + of.offset = 0; + of.n_rec = 0; + +#ifdef POSIX_FADV_SEQUENTIAL + /* The input file will be read sequentially, starting from the + beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL + affects the entire file. Each block will be read exactly once. */ + posix_fadvise(file->fd, 0, 0, + POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE); +#endif /* POSIX_FADV_SEQUENTIAL */ + + /* Merge blocks to the output file. */ + foffs0 = 0; + foffs1 = ihalf; + + MEM_UNDEFINED(run_offset, *num_run * sizeof *run_offset); + + for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { + + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + error = row_merge_blocks(dup, file, block, + &foffs0, &foffs1, &of, stage, + crypt_block, space); + + if (error != DB_SUCCESS) { + return(error); + } + + } + + /* Copy the last blocks, if there are any. 
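+	After the pairwise merge loop above, at most one of the two input halves
+	can still contain unread blocks; those remaining runs are copied to the
+	output file unchanged by row_merge_blocks_copy().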
*/ + + while (foffs0 < ihalf) { + + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs0, &of, stage, + crypt_block, space)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs0 == ihalf); + + while (foffs1 < file->offset) { + + if (trx_is_interrupted(trx)) { + return(DB_INTERRUPTED); + } + + /* Remember the offset number for this run */ + run_offset[n_run++] = of.offset; + + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs1, &of, stage, + crypt_block, space)) { + return(DB_CORRUPTION); + } + } + + ut_ad(foffs1 == file->offset); + + if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) { + return(DB_CORRUPTION); + } + + ut_ad(n_run <= *num_run); + + *num_run = n_run; + + /* Each run can contain one or more offsets. As merge goes on, + the number of runs (to merge) will reduce until we have one + single run. So the number of runs will always be smaller than + the number of offsets in file */ + ut_ad((*num_run) <= file->offset); + + /* The number of offsets in output file is always equal or + smaller than input file */ + ut_ad(of.offset <= file->offset); + + /* Swap file descriptors for the next pass. */ + *tmpfd = file->fd; + *file = of; + + MEM_UNDEFINED(&block[0], 3 * srv_sort_buf_size); + + return(DB_SUCCESS); +} + +/** Merge disk files. +@param[in] trx transaction +@param[in] dup descriptor of index being created +@param[in,out] file file containing index entries +@param[in,out] block 3 buffers +@param[in,out] tmpfd temporary file handle +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially +and then stage->inc() will be called for each record processed. +@return DB_SUCCESS or error code */ +dberr_t +row_merge_sort( + trx_t* trx, + const row_merge_dup_t* dup, + merge_file_t* file, + row_merge_block_t* block, + pfs_os_file_t* tmpfd, + const bool update_progress, + /*!< in: update progress + status variable or not */ + const double pct_progress, + /*!< in: total progress percent + until now */ + const double pct_cost, /*!< in: current progress percent */ + row_merge_block_t* crypt_block, /*!< in: crypt buf or NULL */ + ulint space, /*!< in: space id */ + ut_stage_alter_t* stage) +{ + const ulint half = file->offset / 2; + ulint num_runs; + ulint* run_offset; + dberr_t error = DB_SUCCESS; + ulint merge_count = 0; + ulint total_merge_sort_count; + double curr_progress = 0; + + DBUG_ENTER("row_merge_sort"); + + /* Record the number of merge runs we need to perform */ + num_runs = file->offset; + + if (stage != NULL) { + stage->begin_phase_sort(log2(double(num_runs))); + } + + /* If num_runs are less than 1, nothing to merge */ + if (num_runs <= 1) { + DBUG_RETURN(error); + } + + total_merge_sort_count = ulint(ceil(log2(double(num_runs)))); + + /* "run_offset" records each run's first offset number */ + run_offset = (ulint*) ut_malloc_nokey(file->offset * sizeof(ulint)); + + /* This tells row_merge() where to start for the first round + of merge. */ + run_offset[half] = half; + + /* The file should always contain at least one byte (the end + of file marker). Thus, it must be at least one block. */ + ut_ad(file->offset > 0); + + /* These thd_progress* calls will crash on sol10-64 when innodb_plugin + is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on + sol10-64 in buildbot. 
+ */ +#ifndef __sun__ + /* Progress report only for "normal" indexes. */ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_init(trx->mysql_thd, 1); + } +#endif /* __sun__ */ + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : merge-sorting" + " has estimated " ULINTPF " runs", + num_runs); + } + + /* Merge the runs until we have one big run */ + do { + /* Report progress of merge sort to MySQL for + show processlist progress field */ + /* Progress report only for "normal" indexes. */ +#ifndef __sun__ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset); + } +#endif /* __sun__ */ + + error = row_merge(trx, dup, file, block, tmpfd, + &num_runs, run_offset, stage, + crypt_block, space); + + if(update_progress) { + merge_count++; + curr_progress = (merge_count >= total_merge_sort_count) ? + pct_cost : + pct_cost * static_cast<double>(merge_count) + / static_cast<double>(total_merge_sort_count); + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100); + } + + if (error != DB_SUCCESS) { + break; + } + + MEM_CHECK_DEFINED(run_offset, num_runs * sizeof *run_offset); + } while (num_runs > 1); + + ut_free(run_offset); + + /* Progress report only for "normal" indexes. */ +#ifndef __sun__ + if (dup && !(dup->index->type & DICT_FTS)) { + thd_progress_end(trx->mysql_thd); + } +#endif /* __sun__ */ + + DBUG_RETURN(error); +} + +/** Copy the blob from the given blob file and store it +in field data for the tuple +@param tuple tuple to be inserted +@param heap heap to allocate the memory for the blob storage +@param blob_file file to handle blob data */ +static dberr_t row_merge_copy_blob_from_file(dtuple_t *tuple, mem_heap_t *heap, + merge_file_t *blob_file) +{ + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) + { + dfield_t *field= dtuple_get_nth_field(tuple, i); + const byte *field_data= static_cast<byte*>(dfield_get_data(field)); + ulint field_len= dfield_get_len(field); + if (!dfield_is_ext(field)) + continue; + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(!dfield_is_null(field)); + + ut_ad(mach_read_from_8(field_data) == 0); + uint64_t offset= mach_read_from_8(field_data + 8); + uint32_t len= mach_read_from_4(field_data + 16); + + byte *data= (byte*) mem_heap_alloc(heap, len); + if (dberr_t err= os_file_read(IORequestRead, blob_file->fd, data, + offset, len, nullptr)) + return err; + dfield_set_data(field, data, len); + } + + return DB_SUCCESS; +} + +/** Copy externally stored columns to the data tuple. +@param[in] mrec record containing BLOB pointers, +or NULL to use tuple instead +@param[in] offsets offsets of mrec +@param[in] zip_size compressed page size in bytes, or 0 +@param[in,out] tuple data tuple +@param[in,out] heap memory heap */ +static +void +row_merge_copy_blobs( + const mrec_t* mrec, + const rec_offs* offsets, + ulint zip_size, + dtuple_t* tuple, + mem_heap_t* heap) +{ + ut_ad(mrec == NULL || rec_offs_any_extern(offsets)); + + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + ulint len; + const void* data; + dfield_t* field = dtuple_get_nth_field(tuple, i); + ulint field_len; + const byte* field_data; + + if (!dfield_is_ext(field)) { + continue; + } + + ut_ad(!dfield_is_null(field)); + + /* During the creation of a PRIMARY KEY, the table is + X-locked, and we skip copying records that have been + marked for deletion. 
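+	(Once the table is X-locked, the only remaining mechanism that could
+	free an off-page column is the purge of already delete-marked records,
+	and we never follow the BLOB pointers of such records.)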
Therefore, externally stored + columns cannot possibly be freed between the time the + BLOB pointers are read (row_merge_read_clustered_index()) + and dereferenced (below). */ + if (mrec == NULL) { + field_data + = static_cast<byte*>(dfield_get_data(field)); + field_len = dfield_get_len(field); + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + + ut_a(memcmp(field_data + field_len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + + data = btr_copy_externally_stored_field( + &len, field_data, zip_size, field_len, heap); + } else { + data = btr_rec_copy_externally_stored_field( + mrec, offsets, zip_size, i, &len, heap); + } + + /* Because we have locked the table, any records + written by incomplete transactions must have been + rolled back already. There must not be any incomplete + BLOB columns. */ + ut_a(data); + + dfield_set_data(field, data, len); + } +} + +/** Convert a merge record to a typed data tuple. Note that externally +stored fields are not copied to heap. +@param[in,out] index index on the table +@param[in] mtuple merge record +@param[in] heap memory heap from which memory needed is allocated +@return index entry built. */ +static +void +row_merge_mtuple_to_dtuple( + dict_index_t* index, + dtuple_t* dtuple, + const mtuple_t* mtuple) +{ + ut_ad(!dict_index_is_ibuf(index)); + + memcpy(dtuple->fields, mtuple->fields, + dtuple->n_fields * sizeof *mtuple->fields); +} + +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_merge_insert_index_tuples( + dict_index_t* index, + const dict_table_t* old_table, + const pfs_os_file_t& fd, + row_merge_block_t* block, + const row_merge_buf_t* row_buf, + BtrBulk* btr_bulk, + const ib_uint64_t table_total_rows, + double pct_progress, + double pct_cost, + row_merge_block_t* crypt_block, + ulint space, + ut_stage_alter_t* stage, + merge_file_t* blob_file) +{ + const byte* b; + mem_heap_t* heap; + mem_heap_t* tuple_heap; + dberr_t error = DB_SUCCESS; + ulint foffs = 0; + rec_offs* offsets; + mrec_buf_t* buf; + ulint n_rows = 0; + dtuple_t* dtuple; + ib_uint64_t inserted_rows = 0; + double curr_progress = 0; + dict_index_t* old_index = NULL; + const mrec_t* mrec = NULL; + mtr_t mtr; + + + DBUG_ENTER("row_merge_insert_index_tuples"); + + ut_ad(!srv_read_only_mode); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(!dict_index_is_spatial(index)); + + if (stage != NULL) { + stage->begin_phase_insert(); + } + + tuple_heap = mem_heap_create(1000); + + { + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof *offsets)); + rec_offs_set_n_alloc(offsets, i); + rec_offs_set_n_fields(offsets, dict_index_get_n_fields(index)); + } + + if (row_buf != NULL) { + ut_ad(fd == OS_FILE_CLOSED); + ut_ad(block == NULL); + DBUG_EXECUTE_IF("row_merge_read_failure", + error = DB_CORRUPTION; + goto err_exit;); + buf = NULL; + b = NULL; + dtuple = dtuple_create( + heap, dict_index_get_n_fields(index)); + dtuple_set_n_fields_cmp( + dtuple, dict_index_get_n_unique_in_tree(index)); + } else { + b = block; + dtuple = NULL; + + if (!row_merge_read(fd, foffs, block, crypt_block, space)) { + error = DB_CORRUPTION; + goto err_exit; + } else { + buf = static_cast<mrec_buf_t*>( + mem_heap_alloc(heap, sizeof *buf)); + } + } + + for (;;) { + + if (stage != NULL) { + stage->inc(); + } + + if (row_buf != NULL) { + if (n_rows >= row_buf->n_tuples) { + break; + } + + /* Convert merge tuple record from + row buffer 
to data tuple record */ + row_merge_mtuple_to_dtuple( + index, dtuple, &row_buf->tuples[n_rows]); + n_rows++; + /* BLOB pointers must be copied from dtuple */ + mrec = NULL; + } else { + b = row_merge_read_rec(block, buf, b, index, + fd, &foffs, &mrec, offsets, + crypt_block, + space); + + if (UNIV_UNLIKELY(!b)) { + /* End of list, or I/O error */ + if (mrec) { + error = DB_CORRUPTION; + } + break; + } + + dtuple = row_rec_to_index_entry_low( + mrec, index, offsets, tuple_heap); + } + + old_index = dict_table_get_first_index(old_table); + + if (dict_index_is_clust(index) + && dict_index_is_online_ddl(old_index)) { + error = row_log_table_get_error(old_index); + if (error != DB_SUCCESS) { + break; + } + } + + ut_ad(!dtuple_get_n_ext(dtuple) || index->is_primary()); + + if (!dtuple_get_n_ext(dtuple)) { + } else if (blob_file) { + error = row_merge_copy_blob_from_file( + dtuple, tuple_heap, blob_file); + if (error != DB_SUCCESS) { + break; + } + } else { + /* Off-page columns can be fetched safely + when concurrent modifications to the table + are disabled. (Purge can process delete-marked + records, but row_merge_read_clustered_index() + would have skipped them.) + + When concurrent modifications are enabled, + row_merge_read_clustered_index() will + only see rows from transactions that were + committed before the ALTER TABLE started + (REPEATABLE READ). + + Any modifications after the + row_merge_read_clustered_index() scan + will go through row_log_table_apply(). */ + row_merge_copy_blobs( + mrec, offsets, + old_table->space->zip_size(), + dtuple, tuple_heap); + } + + ut_ad(dtuple_validate(dtuple)); + error = btr_bulk->insert(dtuple); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + mem_heap_empty(tuple_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + inserted_rows++; + if(inserted_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (inserted_rows >= table_total_rows || + table_total_rows <= 0) ? + pct_cost : + pct_cost * static_cast<double>(inserted_rows) + / static_cast<double>(table_total_rows); + + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (ulint) ((pct_progress + curr_progress) * 100); + } + } + +err_exit: + mem_heap_free(tuple_heap); + mem_heap_free(heap); + + DBUG_RETURN(error); +} + +/*********************************************************************//** +Drop an index that was created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static +void +row_merge_drop_index_dict( +/*======================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + index_id_t index_id)/*!< in: index identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index_id); + trx->op_info = "dropping index from dictionary"; + error = que_eval_sql(info, sql, trx); + + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. 
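+		In that case the failure is only logged and trx->error_state is
+		reset to DB_SUCCESS, so that the caller can proceed with its own
+		cleanup.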
*/ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_drop_index_dict failed with error " + << error; + } + + trx->op_info = ""; +} + +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE TABLE_ID=:tableid AND\n" + " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + + "END;\n"; + dberr_t error; + pars_info_t* info; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_sys.latch. */ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "tableid", table_id); + trx->op_info = "dropping indexes"; + error = que_eval_sql(info, sql, trx); + + switch (error) { + case DB_SUCCESS: + break; + default: + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + ib::error() << "row_merge_drop_indexes_dict failed with error " + << error; + /* fall through */ + case DB_TOO_MANY_CONCURRENT_TRXS: + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; +} + +/** Drop common internal tables if all fulltext indexes are dropped +@param trx transaction +@param table user table */ +static void row_merge_drop_fulltext_indexes(trx_t *trx, dict_table_t *table) +{ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) || + !table->fts || + !ib_vector_is_empty(table->fts->indexes)) + return; + + for (const dict_index_t *index= dict_table_get_first_index(table); + index; index= dict_table_get_next_index(index)) + if (index->type & DICT_FTS) + return; + + fts_optimize_remove_table(table); + fts_drop_tables(trx, *table); + table->fts->~fts_t(); + table->fts= nullptr; + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); +} + +/** Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. 
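+If the table is still referenced by other threads, uncommitted indexes are not
+dropped immediately but only marked as aborted, so that they can be dropped
+lazily later.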
+@param trx dictionary transaction +@param table table containing the indexes +@param locked True if table is locked, + false - may need to do lazy drop +@param alter_trx Alter table transaction */ +void +row_merge_drop_indexes( + trx_t* trx, + dict_table_t* table, + bool locked, + const trx_t* alter_trx) +{ + dict_index_t* index; + dict_index_t* next_index; + + ut_ad(!srv_read_only_mode); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + ut_ad(dict_sys.locked()); + + index = dict_table_get_first_index(table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE); + + /* the caller should have an open handle to the table */ + ut_ad(table->get_ref_count() >= 1); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by MDL. */ + + if (!locked && (table->get_ref_count() > 1 + || table->has_lock_other_than(alter_trx))) { + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_ad(!dict_index_is_clust(index)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + if (index->is_committed()) { + /* Do nothing to already + published indexes. */ + } else if (index->type & DICT_FTS) { + /* Drop a completed FULLTEXT + index, due to a timeout during + MDL upgrade for + commit_inplace_alter_table(). + Because only concurrent reads + are allowed (and they are not + seeing this index yet) we + are safe to drop the index. */ + dict_index_t* prev = UT_LIST_GET_PREV( + indexes, index); + /* At least there should be + the clustered index before + this one. */ + ut_ad(prev); + ut_a(table->fts); + fts_drop_index(table, index, trx); + row_merge_drop_index_dict( + trx, index->id); + /* We can remove a DICT_FTS + index from the cache, because + we do not allow ADD FULLTEXT INDEX + with LOCK=NONE. If we allowed that, + we should exclude FTS entries from + prebuilt->ins_node->entry_list + in ins_node_create_entry_list(). */ +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + dict_index_remove_from_cache( + table, index); + index = prev; + } else { + index->lock.x_lock(SRW_LOCK_CALL); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED); + index->type |= DICT_CORRUPT; + table->drop_aborted = TRUE; + goto drop_aborted; + } + continue; + case ONLINE_INDEX_CREATION: + index->lock.x_lock(SRW_LOCK_CALL); + ut_ad(!index->is_committed()); + row_log_abort_sec(index); + drop_aborted: + index->lock.x_unlock(); + + DEBUG_SYNC_C("merge_drop_index_after_abort"); + /* covered by dict_sys.latch */ + MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); + /* fall through */ + case ONLINE_INDEX_ABORTED: + /* Drop the index tree from the + data dictionary and free it from + the tablespace, but keep the object + in the data dictionary cache. */ + row_merge_drop_index_dict(trx, index->id); + index->lock.x_lock(SRW_LOCK_CALL); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED_DROPPED); + index->lock.x_unlock(); + table->drop_aborted = TRUE; + continue; + } + ut_error; + } + + row_merge_drop_fulltext_indexes(trx, table); + return; + } + + row_merge_drop_indexes_dict(trx, table->id); + + /* Invalidate all row_prebuilt_t::ins_graph that are referring + to this table. 
That is, force row_get_prebuilt_insert_row() to + rebuild prebuilt->ins_node->entry_list). */ + if (table->def_trx_id < trx->id) { + table->def_trx_id = trx->id; + } else { + ut_ad(table->def_trx_id == trx->id || table->name.part()); + } + + next_index = dict_table_get_next_index(index); + + while ((index = next_index) != NULL) { + /* read the next pointer before freeing the index */ + next_index = dict_table_get_next_index(index); + + ut_ad(!dict_index_is_clust(index)); + + if (!index->is_committed()) { + /* If it is FTS index, drop from table->fts + and also drop its auxiliary tables */ + if (index->type & DICT_FTS) { + ut_a(table->fts); + fts_drop_index(table, index, trx); + } + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* This state should only be possible + when prepare_inplace_alter_table() fails + after invoking row_merge_create_index(). + In inplace_alter_table(), + row_merge_build_indexes() + should never leave the index in this state. + It would invoke row_log_abort_sec() on + failure. */ + case ONLINE_INDEX_COMPLETE: + /* In these cases, we are able to drop + the index straight. The DROP INDEX was + never deferred. */ + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* covered by dict_sys.latch */ + MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); + } + + dict_index_remove_from_cache(table, index); + } + } + + row_merge_drop_fulltext_indexes(trx, table); + table->drop_aborted = FALSE; + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); +} + +/** Drop fulltext indexes */ +static ibool row_merge_drop_fts(void *node, void *trx) +{ + auto s= static_cast<sel_node_t*>(node); + + const dfield_t *table_id= que_node_get_val(s->select_list); + ut_ad(table_id->type.mtype == DATA_BINARY); + node= que_node_get_next(s->select_list); + ut_ad(!que_node_get_next(node)); + const dfield_t *index_id= que_node_get_val(node); + ut_ad(index_id->type.mtype == DATA_BINARY); + + static const char sql[]= + "PROCEDURE DROP_TABLES_PROC () IS\n" + "tid CHAR;\n" + "iid CHAR;\n" + + "DECLARE CURSOR cur_tab IS\n" + "SELECT ID FROM SYS_TABLES\n" + "WHERE INSTR(NAME,:name)+45=LENGTH(NAME)" + " AND INSTR('123456',SUBSTR(NAME,LENGTH(NAME)-1,1))>0" + " FOR UPDATE;\n" + + "DECLARE CURSOR cur_idx IS\n" + "SELECT ID FROM SYS_INDEXES\n" + "WHERE TABLE_ID = tid FOR UPDATE;\n" + + "BEGIN\n" + "OPEN cur_tab;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH cur_tab INTO tid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " OPEN cur_idx;\n" + " WHILE 1 = 1 LOOP\n" + " FETCH cur_idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF cur_idx;\n" + " END LOOP;\n" + " CLOSE cur_idx;\n" + " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" + " DELETE FROM SYS_TABLES WHERE CURRENT OF cur_tab;\n" + "END LOOP;\n" + "CLOSE cur_tab;\n" + "END;\n"; + + if (table_id->len == 8 && index_id->len == 8) + { + char buf[sizeof "/FTS_0000000000000000_0000000000000000_INDEX_"]; + snprintf(buf, sizeof buf, "/FTS_%016llx_%016llx_INDEX_", + static_cast<ulonglong> + (mach_read_from_8(static_cast<const byte*>(table_id->data))), + static_cast<ulonglong> + (mach_read_from_8(static_cast<const byte*>(index_id->data)))); + auto pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "name", buf); + que_eval_sql(pinfo, sql, static_cast<trx_t*>(trx)); + } + + return true; +} + +/** During recovery, drop recovered index stubs that were created in +prepare_inplace_alter_table_dict(). 
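+Such index stubs are recognized by the TEMP_INDEX_PREFIX_STR marker at the start
+of their name; orphaned FULLTEXT auxiliary tables belonging to them are dropped
+as well.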
*/ +void row_merge_drop_temp_indexes() +{ + static_assert(DICT_FTS == 32, "compatibility"); + + static const char sql[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE FUNCTION drop_fts;\n" + + "DECLARE CURSOR fts_cur IS\n" + " SELECT TABLE_ID,ID FROM SYS_INDEXES\n" + " WHERE TYPE=32" + " AND SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + " FOR UPDATE;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN fts_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH fts_cur INTO drop_fts();\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE fts_cur;\n" + + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + "END;\n"; + + /* Load the table definitions that contain partially defined + indexes, so that the data dictionary information can be checked + when accessing the tablename.ibd files. */ + trx_t* trx = trx_create(); + trx_start_for_ddl(trx); + trx->op_info = "dropping partially created indexes"; + dberr_t error = lock_sys_tables(trx); + + row_mysql_lock_data_dictionary(trx); + /* Ensure that this transaction will be rolled back and locks + will be released, if the server gets killed before the commit + gets written to the redo log. */ + trx->dict_operation = true; + + trx->op_info = "dropping indexes"; + + pars_info_t* pinfo = pars_info_create(); + pars_info_bind_function(pinfo, "drop_fts", row_merge_drop_fts, trx); + if (error == DB_SUCCESS) { + error = que_eval_sql(pinfo, sql, trx); + } + + if (error) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_drop_temp_indexes(): " << error; + } + + trx_commit_for_mysql(trx); + row_mysql_unlock_data_dictionary(trx); + trx->free(); +} + + +/** Create temporary merge files in the given paramater path, and if +UNIV_PFS_IO defined, register the file descriptor with Performance Schema. +@param[in] path location for creating temporary merge files, or NULL +@return File descriptor */ +pfs_os_file_t +row_merge_file_create_low( + const char* path) +{ + if (!path) { + path = mysql_tmpdir; + } +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker* locker; + PSI_file_locker_state state; + static const char label[] = "/Innodb Merge Temp File"; + char* name = static_cast<char*>( + ut_malloc_nokey(strlen(path) + sizeof label)); + strcpy(name, path); + strcat(name, label); + + register_pfs_file_open_begin( + &state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? 
name : label, __FILE__, __LINE__); + +#endif + DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); + char filename[FN_REFLEN]; + File f = create_temp_file(filename, path, "ib", + O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY)); + pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f); + +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd, + (fd == OS_FILE_CLOSED)?NULL:&fd); + ut_free(name); +#endif + + if (fd == OS_FILE_CLOSED) { + ib::error() << "Cannot create temporary merge file"; + } + return(fd); +} + + +/** Create a merge file in the given location. +@param[out] merge_file merge file structure +@param[in] path location for creating temporary file, or NULL +@return file descriptor, or OS_FILE_CLOSED on error */ +pfs_os_file_t +row_merge_file_create( + merge_file_t* merge_file, + const char* path) +{ + merge_file->fd = row_merge_file_create_low(path); + merge_file->offset = 0; + merge_file->n_rec = 0; + + if (merge_file->fd != OS_FILE_CLOSED) { + if (srv_disable_sort_file_cache) { + os_file_set_nocache(merge_file->fd, + "row0merge.cc", "sort"); + } + } + return(merge_file->fd); +} + +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +void +row_merge_file_destroy_low( +/*=======================*/ + const pfs_os_file_t& fd) /*!< in: merge file descriptor */ +{ + if (fd != OS_FILE_CLOSED) { + int res = mysql_file_close(IF_WIN(my_win_handle2File((os_file_t)fd), fd), + MYF(MY_WME)); + ut_a(res != -1); + } +} +/*********************************************************************//** +Destroy a merge file. */ +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< in/out: merge file structure */ +{ + ut_ad(!srv_read_only_mode); + + if (merge_file->fd != OS_FILE_CLOSED) { + row_merge_file_destroy_low(merge_file->fd); + merge_file->fd = OS_FILE_CLOSED; + } +} + +/*********************************************************************//** +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ +{ + dberr_t err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. */ + + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; + + ut_ad(trx->dict_operation_lock_mode); + ut_ad(trx->dict_operation); + + trx->op_info = "renaming index to add"; + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); + + err = que_eval_sql(info, rename_index, trx); + + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; + + ib::error() << "row_merge_rename_index_to_add failed with" + " error " << err; + } + + trx->op_info = ""; + + return(err); +} + +/** Create the index and load in to the dictionary. 
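+Only an in-memory index prototype is built from the given definition; it is not
+persisted to the data dictionary here.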
+@param[in,out] table the index is on this table +@param[in] index_def the index definition +@param[in] add_v new virtual columns added along with add + index call +@return index, or NULL on error */ +dict_index_t* +row_merge_create_index( + dict_table_t* table, + const index_def_t* index_def, + const dict_add_v_col_t* add_v) +{ + dict_index_t* index; + ulint n_fields = index_def->n_fields; + ulint i; + ulint n_add_vcol = 0; + + DBUG_ENTER("row_merge_create_index"); + + ut_ad(!srv_read_only_mode); + + /* Create the index prototype, using the passed in def, this is not + a persistent operation. We pass 0 as the space id, and determine at + a lower level the space id where to store the table. */ + + index = dict_mem_index_create(table, index_def->name, + index_def->ind_type, n_fields); + index->set_committed(index_def->rebuild); + + for (i = 0; i < n_fields; i++) { + const char* name; + index_field_t* ifield = &index_def->fields[i]; + + if (ifield->is_v_col) { + if (ifield->col_no >= table->n_v_def) { + ut_ad(ifield->col_no < table->n_v_def + + add_v->n_v_col); + ut_ad(ifield->col_no >= table->n_v_def); + name = add_v->v_col_name[ + ifield->col_no - table->n_v_def]; + n_add_vcol++; + } else { + name = dict_table_get_v_col_name( + table, ifield->col_no); + } + } else { + name = dict_table_get_col_name(table, ifield->col_no); + } + + dict_mem_index_add_field(index, name, ifield->prefix_len, + ifield->descending); + } + + if (n_add_vcol) { + index->assign_new_v_col(n_add_vcol); + } + + DBUG_RETURN(index); +} + +/*********************************************************************//** +Check if a transaction can use an index. */ +bool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to check */ +{ + if (!index->is_primary() + && dict_index_is_online_ddl(index)) { + /* Indexes that are being created are not useable. */ + return(false); + } + + return(!index->is_corrupted() + && (index->table->is_temporary() || index->table->no_rollback() + || index->trx_id == 0 + || !trx->read_view.is_open() + || trx->read_view.changes_visible(index->trx_id))); +} + +/** Build indexes on a table by reading a clustered index, creating a temporary +file containing index entries, merge sorting these index entries and inserting +sorted index entries to indexes. +@param[in] trx transaction +@param[in] old_table table where rows are read from +@param[in] new_table table where indexes are created; identical to +old_table unless creating a PRIMARY KEY +@param[in] online true if creating indexes online +@param[in] indexes indexes to be created +@param[in] key_numbers MySQL key numbers +@param[in] n_indexes size of indexes[] +@param[in,out] table MySQL table, for reporting erroneous key value +if applicable +@param[in] defaults default values of added, changed columns, or NULL +@param[in] col_map mapping of old column numbers to new ones, or +NULL if old_table == new_table +@param[in] add_autoinc number of added AUTO_INCREMENT columns, or +ULINT_UNDEFINED if none is added +@param[in,out] sequence autoinc sequence +@param[in] skip_pk_sort whether the new PRIMARY KEY will follow +existing order +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of +this function and it will be passed to other functions for further accounting. 
+@param[in] add_v new virtual columns added along with indexes +@param[in] eval_table mysql table used to evaluate virtual column + value, see innobase_get_computed_value(). +@param[in] allow_not_null allow the conversion from null to not-null +@param[in] col_collate columns whose collations changed, or nullptr +@return DB_SUCCESS or error code */ +dberr_t +row_merge_build_indexes( + trx_t* trx, + dict_table_t* old_table, + dict_table_t* new_table, + bool online, + dict_index_t** indexes, + const ulint* key_numbers, + ulint n_indexes, + struct TABLE* table, + const dtuple_t* defaults, + const ulint* col_map, + ulint add_autoinc, + ib_sequence_t& sequence, + bool skip_pk_sort, + ut_stage_alter_t* stage, + const dict_add_v_col_t* add_v, + struct TABLE* eval_table, + bool allow_not_null, + const col_collations* col_collate) +{ + merge_file_t* merge_files; + row_merge_block_t* block; + ut_new_pfx_t block_pfx; + size_t block_size; + ut_new_pfx_t crypt_pfx; + row_merge_block_t* crypt_block = NULL; + ulint i; + ulint j; + dberr_t error; + pfs_os_file_t tmpfd = OS_FILE_CLOSED; + dict_index_t* fts_sort_idx = NULL; + fts_psort_t* psort_info = NULL; + fts_psort_t* merge_info = NULL; + bool fts_psort_initiated = false; + + double total_static_cost = 0; + double total_dynamic_cost = 0; + ulint total_index_blocks = 0; + double pct_cost=0; + double pct_progress=0; + + DBUG_ENTER("row_merge_build_indexes"); + + ut_ad(!srv_read_only_mode); + ut_ad((old_table == new_table) == !col_map); + ut_ad(!defaults || col_map); + + stage->begin_phase_read_pk(skip_pk_sort && new_table != old_table + ? n_indexes - 1 + : n_indexes); + + /* Allocate memory for merge file data structure and initialize + fields */ + + ut_allocator<row_merge_block_t> alloc(mem_key_row_merge_sort); + + /* This will allocate "3 * srv_sort_buf_size" elements of type + row_merge_block_t. The latter is defined as byte. 
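+	Three blocks are needed because a merge pass reads two input runs and
+	writes one merged output run, each through its own sort-buffer-sized
+	block.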
*/ + block_size = 3 * srv_sort_buf_size; + block = alloc.allocate_large(block_size, &block_pfx); + + if (block == NULL) { + DBUG_RETURN(DB_OUT_OF_MEMORY); + } + + crypt_pfx.m_size = 0; /* silence bogus -Wmaybe-uninitialized */ + TRASH_ALLOC(&crypt_pfx, sizeof crypt_pfx); + + if (srv_encrypt_log) { + crypt_block = static_cast<row_merge_block_t*>( + alloc.allocate_large(block_size, + &crypt_pfx)); + + if (crypt_block == NULL) { + DBUG_RETURN(DB_OUT_OF_MEMORY); + } + } + + trx_start_if_not_started_xa(trx, true); + ulint n_merge_files = 0; + + for (ulint i = 0; i < n_indexes; i++) + { + if (!dict_index_is_spatial(indexes[i])) { + n_merge_files++; + } + } + + merge_files = static_cast<merge_file_t*>( + ut_malloc_nokey(n_merge_files * sizeof *merge_files)); + + /* Initialize all the merge file descriptors, so that we + don't call row_merge_file_destroy() on uninitialized + merge file descriptor */ + + for (i = 0; i < n_merge_files; i++) { + merge_files[i].fd = OS_FILE_CLOSED; + merge_files[i].offset = 0; + merge_files[i].n_rec = 0; + } + + total_static_cost = COST_BUILD_INDEX_STATIC + * static_cast<double>(n_indexes) + COST_READ_CLUSTERED_INDEX; + total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC + * static_cast<double>(n_indexes); + for (i = 0; i < n_indexes; i++) { + if (indexes[i]->type & DICT_FTS) { + ibool opt_doc_id_size = FALSE; + + /* To build FTS index, we would need to extract + doc's word, Doc ID, and word's position, so + we need to build a "fts sort index" indexing + on above three 'fields' */ + fts_sort_idx = row_merge_create_fts_sort_index( + indexes[i], old_table, &opt_doc_id_size); + + row_merge_dup_t* dup + = static_cast<row_merge_dup_t*>( + ut_malloc_nokey(sizeof *dup)); + dup->index = fts_sort_idx; + dup->table = table; + dup->col_map = col_map; + dup->n_dup = 0; + + /* This can fail e.g. if temporal files can't be + created */ + if (!row_fts_psort_info_init( + trx, dup, new_table, opt_doc_id_size, + old_table->space->zip_size(), + &psort_info, &merge_info)) { + error = DB_CORRUPTION; + goto func_exit; + } + + /* We need to ensure that we free the resources + allocated */ + fts_psort_initiated = true; + } + } + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : Start reading" + " clustered index of the table" + " and create temporary files"); + } + + pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost); + + /* Do not continue if we can't encrypt table pages */ + if (!old_table->is_readable() || + !new_table->is_readable()) { + error = DB_DECRYPTION_FAILED; + ib_push_warning(trx->mysql_thd, DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + !old_table->is_readable() ? 
old_table->name.m_name : + new_table->name.m_name); + goto func_exit; + } + + /* Read clustered index of the table and create files for + secondary index entries for merge sort */ + error = row_merge_read_clustered_index( + trx, table, old_table, new_table, online, indexes, + fts_sort_idx, psort_info, merge_files, key_numbers, + n_indexes, defaults, add_v, col_map, add_autoinc, + sequence, block, skip_pk_sort, &tmpfd, stage, + pct_cost, crypt_block, eval_table, allow_not_null, + col_collate); + + stage->end_phase_read_pk(); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL : End of reading " + "clustered index of the table" + " and create temporary files"); + } + + for (i = 0; i < n_merge_files; i++) { + total_index_blocks += merge_files[i].offset; + } + + if (error != DB_SUCCESS) { + goto func_exit; + } + + DEBUG_SYNC_C("row_merge_after_scan"); + + /* Now we have files containing index entries ready for + sorting and inserting. */ + + for (ulint k = 0, i = 0; i < n_indexes; i++) { + dict_index_t* sort_idx = indexes[i]; + + if (dict_index_is_spatial(sort_idx)) { + continue; + } + + if (indexes[i]->type & DICT_FTS) { + + sort_idx = fts_sort_idx; + + if (FTS_PLL_MERGE) { + row_fts_start_parallel_merge(merge_info); + for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { + merge_info[j].task->wait(); + delete merge_info[j].task; + } + } else { + /* This cannot report duplicates; an + assertion would fail in that case. */ + error = row_fts_merge_insert( + sort_idx, new_table, + psort_info, 0); + } + +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); +#endif + } else if (merge_files[k].fd != OS_FILE_CLOSED) { + char buf[NAME_LEN + 1]; + row_merge_dup_t dup = { + sort_idx, table, col_map, 0}; + + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost + * static_cast<double>(merge_files[k].offset) + / static_cast<double>(total_index_blocks))) + / (total_static_cost + total_dynamic_cost) + * PCT_COST_MERGESORT_INDEX * 100; + char* bufend = innobase_convert_name( + buf, sizeof buf, + indexes[i]->name, + strlen(indexes[i]->name), + trx->mysql_thd); + buf[bufend - buf]='\0'; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL :" + " Start merge-sorting" + " index %s" + " (" ULINTPF + " / " ULINTPF ")," + " estimated cost :" + " %2.4f", + buf, i + 1, n_indexes, + pct_cost); + } + + error = row_merge_sort( + trx, &dup, &merge_files[k], + block, &tmpfd, true, + pct_progress, pct_cost, + crypt_block, new_table->space_id, + stage); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information("InnoDB: Online DDL :" + " End of " + " merge-sorting index %s" + " (" ULINTPF + " / " ULINTPF ")", + buf, i + 1, n_indexes); + } + + if (error == DB_SUCCESS) { + BtrBulk btr_bulk(sort_idx, trx); + + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost + * static_cast<double>( + merge_files[k].offset) + / static_cast<double>( + total_index_blocks))) + / (total_static_cost + + total_dynamic_cost) + * PCT_COST_INSERT_INDEX * 100; + + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : Start " + "building index %s" + " (" ULINTPF + " / " ULINTPF "), estimated " + "cost : %2.4f", buf, i + 1, + n_indexes, pct_cost); + } + + error = row_merge_insert_index_tuples( + sort_idx, old_table, + merge_files[k].fd, block, NULL, + &btr_bulk, + merge_files[k].n_rec, pct_progress, pct_cost, + 
crypt_block, new_table->space_id, + stage); + + error = btr_bulk.finish(error); + + pct_progress += pct_cost; + + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : " + "End of building index %s" + " (" ULINTPF " / " ULINTPF ")", + buf, i + 1, n_indexes); + } + } + } + + /* Close the temporary file to free up space. */ + row_merge_file_destroy(&merge_files[k++]); + + if (indexes[i]->type & DICT_FTS) { + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } else if (old_table != new_table) { + ut_ad(!sort_idx->online_log); + ut_ad(sort_idx->online_status + == ONLINE_INDEX_COMPLETE); + } + + if (old_table != new_table + || (indexes[i]->type & (DICT_FTS | DICT_SPATIAL)) + || error != DB_SUCCESS || !online) { + /* Do not apply any online log. */ + } else { + if (global_system_variables.log_warnings > 2) { + sql_print_information( + "InnoDB: Online DDL : Applying" + " log to index"); + } + + DEBUG_SYNC_C("row_log_apply_before"); + error = row_log_apply(trx, sort_idx, table, stage); + DEBUG_SYNC_C("row_log_apply_after"); + } + + if (error != DB_SUCCESS) { + trx->error_key_num = key_numbers[i]; + goto func_exit; + } + + if (indexes[i]->type & DICT_FTS + && UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "Finished building full-text index " + << indexes[i]->name; + } + } + +func_exit: + + DBUG_EXECUTE_IF( + "ib_build_indexes_too_many_concurrent_trxs", + error = DB_TOO_MANY_CONCURRENT_TRXS; + trx->error_state = error;); + + if (fts_psort_initiated) { + /* Clean up FTS psort related resource */ + row_fts_psort_info_destroy(psort_info, merge_info); + fts_psort_initiated = false; + } + + row_merge_file_destroy_low(tmpfd); + + for (i = 0; i < n_merge_files; i++) { + row_merge_file_destroy(&merge_files[i]); + } + + if (fts_sort_idx) { + dict_mem_index_free(fts_sort_idx); + } + + ut_free(merge_files); + + alloc.deallocate_large(block, &block_pfx); + + if (crypt_block) { + alloc.deallocate_large(crypt_block, &crypt_pfx); + } + + DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); + + if (online && old_table == new_table && error != DB_SUCCESS) { + /* On error, flag all online secondary index creation + as aborted. */ + for (i = 0; i < n_indexes; i++) { + ut_ad(!(indexes[i]->type & DICT_FTS)); + ut_ad(!indexes[i]->is_committed()); + ut_ad(!dict_index_is_clust(indexes[i])); + + /* Completed indexes should be dropped as + well, and indexes whose creation was aborted + should be dropped from the persistent + storage. However, at this point we can only + set some flags in the not-yet-published + indexes. These indexes will be dropped later + in row_merge_drop_indexes(), called by + rollback_inplace_alter_table(). 
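+(Illustrative note, not part of the original comment: the switch below x-locks each ONLINE_INDEX_CREATION index, aborts its row log via row_log_abort_sec(), marks it DICT_CORRUPT, sets new_table->drop_aborted, and falls through to bump the MONITOR_BACKGROUND_DROP_INDEX counter together with the already aborted states.)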
*/ + + switch (dict_index_get_online_status(indexes[i])) { + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + indexes[i]->lock.x_lock(SRW_LOCK_CALL); + row_log_abort_sec(indexes[i]); + indexes[i]->type |= DICT_CORRUPT; + indexes[i]->lock.x_unlock(); + new_table->drop_aborted = TRUE; + /* fall through */ + case ONLINE_INDEX_ABORTED_DROPPED: + case ONLINE_INDEX_ABORTED: + MONITOR_ATOMIC_INC( + MONITOR_BACKGROUND_DROP_INDEX); + } + } + + dict_index_t *clust_index= new_table->indexes.start; + clust_index->lock.x_lock(SRW_LOCK_CALL); + ut_ad(!clust_index->online_log || + clust_index->online_log_is_dummy()); + clust_index->online_log= nullptr; + clust_index->lock.x_unlock(); + } + + DBUG_RETURN(error); +} + +dberr_t row_merge_bulk_t::alloc_block() +{ + if (m_block) + return DB_SUCCESS; + m_block= m_alloc.allocate_large_dontdump( + 3 * srv_sort_buf_size, &m_block_pfx); + if (m_block == nullptr) + return DB_OUT_OF_MEMORY; + + m_crypt_pfx.m_size= 0; + TRASH_ALLOC(&m_crypt_pfx, sizeof m_crypt_pfx); + if (srv_encrypt_log) + { + m_crypt_block= static_cast<row_merge_block_t*>( + m_alloc.allocate_large(3 * srv_sort_buf_size, &m_crypt_pfx)); + if (!m_crypt_block) + return DB_OUT_OF_MEMORY; + } + return DB_SUCCESS; +} + +row_merge_bulk_t::row_merge_bulk_t(dict_table_t *table) +{ + ulint n_index= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + n_index++; + } + + m_merge_buf= static_cast<row_merge_buf_t*>( + ut_zalloc_nokey(n_index * sizeof *m_merge_buf)); + + ulint i= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + mem_heap_t *heap= mem_heap_create(100); + row_merge_buf_create_low(&m_merge_buf[i], heap, index); + i++; + } + + m_tmpfd= OS_FILE_CLOSED; + m_blob_file.fd= OS_FILE_CLOSED; + m_blob_file.offset= 0; + m_blob_file.n_rec= 0; +} + +row_merge_bulk_t::~row_merge_bulk_t() +{ + ulint i= 0; + dict_table_t *table= m_merge_buf[0].index->table; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + row_merge_buf_free(&m_merge_buf[i]); + if (m_merge_files) + row_merge_file_destroy(&m_merge_files[i]); + i++; + } + + row_merge_file_destroy_low(m_tmpfd); + + row_merge_file_destroy(&m_blob_file); + + ut_free(m_merge_buf); + + ut_free(m_merge_files); + + if (m_block) + m_alloc.deallocate_large(m_block, &m_block_pfx); + + if (m_crypt_block) + m_alloc.deallocate_large(m_crypt_block, &m_crypt_pfx); +} + +void row_merge_bulk_t::init_tmp_file() +{ + if (m_merge_files) + return; + + ulint n_index= 0; + dict_table_t *table= m_merge_buf[0].index->table; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + n_index++; + } + + m_merge_files= static_cast<merge_file_t*>( + ut_malloc_nokey(n_index * sizeof *m_merge_files)); + + for (ulint i= 0; i < n_index; i++) + { + m_merge_files[i].fd= OS_FILE_CLOSED; + m_merge_files[i].offset= 0; + m_merge_files[i].n_rec= 0; + } +} + +void row_merge_bulk_t::clean_bulk_buffer(ulint index_no) +{ + mem_heap_empty(m_merge_buf[index_no].heap); + m_merge_buf[index_no].total_size = m_merge_buf[index_no].n_tuples = 0; +} + +bool row_merge_bulk_t::create_tmp_file(ulint index_no) +{ + return row_merge_file_create_if_needed( + &m_merge_files[index_no], 
&m_tmpfd, + m_merge_buf[index_no].n_tuples, NULL); +} + +dberr_t row_merge_bulk_t::write_to_tmp_file(ulint index_no) +{ + if (!create_tmp_file(index_no)) + return DB_OUT_OF_MEMORY; + merge_file_t *file= &m_merge_files[index_no]; + row_merge_buf_t *buf= &m_merge_buf[index_no]; + + alloc_block(); + + if (dberr_t err= row_merge_buf_write(buf, +#ifndef DBUG_OFF + file, +#endif + m_block, + index_no == 0 ? &m_blob_file : nullptr)) + return err; + + if (!row_merge_write(file->fd, file->offset++, + m_block, m_crypt_block, + buf->index->table->space->id)) + return DB_TEMP_FILE_WRITE_FAIL; + MEM_UNDEFINED(&m_block[0], srv_sort_buf_size); + return DB_SUCCESS; +} + +dberr_t row_merge_bulk_t::bulk_insert_buffered(const dtuple_t &row, + const dict_index_t &ind, + trx_t *trx) +{ + dberr_t err= DB_SUCCESS; + ulint i= 0; + mem_heap_t *large_tuple_heap= nullptr; + for (dict_index_t *index= UT_LIST_GET_FIRST(ind.table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + if (index != &ind) + { + i++; + continue; + } + row_merge_buf_t *buf= &m_merge_buf[i]; +add_to_buf: + if (row_merge_bulk_buf_add(buf, *ind.table, row)) + { + i++; + goto func_exit; + } + + if (buf->n_tuples == 0) + { + /* Tuple data size is greater than srv_sort_buf_size */ + dtuple_t *big_tuple= row_merge_buf_large_tuple( + row, &m_blob_file, &large_tuple_heap); + if (row_merge_bulk_buf_add(buf, *ind.table, *big_tuple)) + { + i++; + goto func_exit; + } + } + + if (index->is_unique()) + { + row_merge_dup_t dup{index, nullptr, nullptr, 0}; + row_merge_buf_sort(buf, &dup); + if (dup.n_dup) + { + trx->error_info= index; + err= DB_DUPLICATE_KEY; + goto func_exit; + } + } + else + row_merge_buf_sort(buf, NULL); + init_tmp_file(); + merge_file_t *file= &m_merge_files[i]; + file->n_rec+= buf->n_tuples; + err= write_to_tmp_file(i); + if (err != DB_SUCCESS) + { + trx->error_info= index; + goto func_exit; + } + clean_bulk_buffer(i); + buf= &m_merge_buf[i]; + goto add_to_buf; + } + +func_exit: + if (large_tuple_heap) + mem_heap_free(large_tuple_heap); + return err; +} + +dberr_t row_merge_bulk_t::write_to_index(ulint index_no, trx_t *trx) +{ + dberr_t err= DB_SUCCESS; + row_merge_buf_t buf= m_merge_buf[index_no]; + merge_file_t *file= m_merge_files ? + &m_merge_files[index_no] : nullptr; + dict_index_t *index= buf.index; + dict_table_t *table= index->table; + BtrBulk btr_bulk(index, trx); + row_merge_dup_t dup = {index, nullptr, nullptr, 0}; + + if (buf.n_tuples) + { + if (dict_index_is_unique(index)) + { + row_merge_buf_sort(&buf, &dup); + if (dup.n_dup) + { + err= DB_DUPLICATE_KEY; + goto func_exit; + } + } + else row_merge_buf_sort(&buf, NULL); + if (file && file->fd != OS_FILE_CLOSED) + { + file->n_rec+= buf.n_tuples; + err= write_to_tmp_file(index_no); + if (err!= DB_SUCCESS) + goto func_exit; + } + else + { + /* Data got fit in merge buffer. */ + err= row_merge_insert_index_tuples( + index, table, OS_FILE_CLOSED, nullptr, + &buf, &btr_bulk, 0, 0, 0, nullptr, table->space_id, nullptr, + m_blob_file.fd == OS_FILE_CLOSED ? 
nullptr : &m_blob_file); + goto func_exit; + } + } + + err= row_merge_sort(trx, &dup, file, + m_block, &m_tmpfd, true, 0, 0, + m_crypt_block, table->space_id, nullptr); + if (err != DB_SUCCESS) + goto func_exit; + + err= row_merge_insert_index_tuples( + index, table, file->fd, m_block, nullptr, + &btr_bulk, 0, 0, 0, m_crypt_block, table->space_id, + nullptr, &m_blob_file); + +func_exit: + if (err != DB_SUCCESS) + trx->error_info= index; + else if (index->is_primary() && table->persistent_autoinc) + btr_write_autoinc(index, table->autoinc - 1); + err= btr_bulk.finish(err); + return err; +} + +dberr_t row_merge_bulk_t::write_to_table(dict_table_t *table, trx_t *trx) +{ + ulint i= 0; + for (dict_index_t *index= UT_LIST_GET_FIRST(table->indexes); + index; index= UT_LIST_GET_NEXT(indexes, index)) + { + if (!index->is_btree()) + continue; + + dberr_t err= write_to_index(i, trx); + if (err != DB_SUCCESS) + return err; + i++; + } + + return DB_SUCCESS; +} + +dberr_t trx_mod_table_time_t::write_bulk(dict_table_t *table, trx_t *trx) +{ + if (!bulk_store) + return DB_SUCCESS; + dberr_t err= bulk_store->write_to_table(table, trx); + delete bulk_store; + bulk_store= nullptr; + return err; +} + +dberr_t trx_t::bulk_insert_apply_low() +{ + ut_ad(bulk_insert); + ut_ad(!check_unique_secondary); + ut_ad(!check_foreigns); + dberr_t err; + for (auto& t : mod_tables) + if (t.second.is_bulk_insert()) + if ((err= t.second.write_bulk(t.first, this)) != DB_SUCCESS) + goto bulk_rollback; + return DB_SUCCESS; +bulk_rollback: + undo_no_t low_limit= UINT64_MAX; + for (auto& t : mod_tables) + { + if (t.second.is_bulk_insert()) + { + if (t.second.get_first() < low_limit) + low_limit= t.second.get_first(); + delete t.second.bulk_store; + t.second.bulk_store= nullptr; + } + } + trx_savept_t bulk_save{low_limit}; + rollback(&bulk_save); + return err; +} diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc new file mode 100644 index 00000000..c5ee3be7 --- /dev/null +++ b/storage/innobase/row/row0mysql.cc @@ -0,0 +1,2916 @@ +/***************************************************************************** + +Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0mysql.cc +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. 
+ +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include <debug_sync.h> +#include <gstream.h> +#include <spatial.h> + +#include "row0mysql.h" +#include "buf0flu.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0load.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "fsp0file.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0pars.h" +#include "que0que.h" +#include "rem0cmp.h" +#include "row0import.h" +#include "row0ins.h" +#include "row0row.h" +#include "row0sel.h" +#include "row0upd.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "log.h" + +#include <algorithm> +#include <vector> +#include <thread> + + +/** Delay an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static void row_mysql_delay_if_needed() +{ + const auto delay= srv_dml_needed_delay; + if (UNIV_UNLIKELY(delay != 0)) + { + /* Adjust for purge_coordinator_state::refresh() */ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t last= log_sys.last_checkpoint_lsn, + max_age= log_sys.max_checkpoint_age; + log_sys.latch.rd_unlock(); + const lsn_t lsn= log_sys.get_lsn(); + if ((lsn - last) / 4 >= max_age / 5) + buf_flush_ahead(last + max_age / 5, false); + purge_sys.wake_if_not_active(); + std::this_thread::sleep_for(std::chrono::microseconds(delay)); + } +} + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +{ + DBUG_ENTER("row_mysql_prebuilt_free_blob_heap"); + + DBUG_PRINT("row_mysql_prebuilt_free_blob_heap", + ("blob_heap freeing: %p", prebuilt->blob_heap)); + + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + DBUG_VOID_RETURN; +} + +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. 
+@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen) /*!< in: storage length of len: either 1 + or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. */ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + memcpy(dest + col_len - 8, &data, sizeof data); +} + +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len) /*!< in: BLOB reference length + (not BLOB length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/*******************************************************************//** +Converting InnoDB geometry data format to MySQL data format. */ +void +row_mysql_store_geometry( +/*=====================*/ + byte* dest, /*!< in/out: where to store */ + ulint dest_len, /*!< in: dest buffer size: determines + into how many bytes the GEOMETRY length + is stored, the space for the length + may vary from 1 to 4 bytes */ + const byte* src, /*!< in: GEOMETRY data; if the value to + store is SQL NULL this should be NULL + pointer */ + ulint src_len) /*!< in: GEOMETRY length; if the value + to store is SQL NULL this should be 0; + remember also to set the NULL bit in + the MySQL record header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + MEM_CHECK_DEFINED(src, src_len); + + memset(dest, '\0', dest_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. 
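+(Illustrative note, not part of the original comment: with dest_len = 12, src_len is written little-endian into dest[0..3] and the in-memory pointer to the data into dest[4..11].)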
*/ + + ut_ad(dest_len - 8 > 1 || src_len < 1<<8); + ut_ad(dest_len - 8 > 2 || src_len < 1<<16); + ut_ad(dest_len - 8 > 3 || src_len < 1<<24); + + mach_write_to_n_little_endian(dest, dest_len - 8, src_len); + + memcpy(dest + dest_len - 8, &src, sizeof src); +} + +/*******************************************************************//** +Read geometry data in the MySQL format. +@return pointer to geometry data */ +static +const byte* +row_mysql_read_geometry( +/*====================*/ + ulint* len, /*!< out: data length */ + const byte* ref, /*!< in: geometry data in the + MySQL format */ + ulint col_len) /*!< in: MySQL format length */ +{ + byte* data; + ut_ad(col_len > 8); + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/**************************************************************//** +Pad a column with spaces. */ +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len) /*!< in: number of bytes to pad */ +{ + const byte* pad_end; + + switch (UNIV_EXPECT(mbminlen, 1)) { + default: + ut_error; + case 1: + /* space=0x20 */ + memset(pad, 0x20, len); + break; + case 2: + /* space=0x0020 */ + pad_end = pad + len; + ut_a(!(len % 2)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x20; + }; + break; + case 4: + /* space=0x00000020 */ + pad_end = pad + len; + ut_a(!(len % 4)); + while (pad < pad_end) { + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x00; + *pad++ = 0x20; + } + break; + } +} + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /*!< in: nonzero=compact format */ +{ + const byte* ptr = mysql_data; + const dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. 
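+(Illustrative note, not part of the original comment: a signed 32-bit value 1 arrives as the little-endian bytes 01 00 00 00, the loop below reverses them to 00 00 00 01, and the sign-bit flip produces 80 00 00 01; likewise -1 becomes 7F FF FF FF, so the stored forms compare correctly byte by byte.)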
*/ + + byte* p = buf + col_len; + + for (;;) { + p--; + *p = *mysql_data; + if (p == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *buf ^= 128; + } + + ptr = buf; + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle Unicode strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + switch (mbminlen) { + default: + ut_error; + case 4: + /* space=0x00000020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~3U; + + while (col_len >= 4 + && ptr[col_len - 4] == 0x00 + && ptr[col_len - 3] == 0x00 + && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 4; + } + break; + case 2: + /* space=0x0020 */ + /* Trim "half-chars", just in case. */ + col_len &= ~1U; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + break; + case 1: + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store an ASCII string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string + ".a " where "." denotes a 3-byte character represented + by the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string + ".abc " will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.cc, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (!row_format_col) { + /* if mysql data is from a MySQL key value + since the length is always stored in 2 bytes, + we need do nothing here. */ + } else if (type == DATA_BLOB) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } else if (DATA_GEOMETRY_MTYPE(type)) { + ptr = row_mysql_read_geometry(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/**************************************************************//** +Convert a row in the MySQL format to a row in the Innobase format. 
Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /*!< in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + const byte* mysql_rec, /*!< in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ + mem_heap_t** blob_heap) /*!< in: FIX_ME, remove this after + server fixes its issue */ +{ + const mysql_row_templ_t*templ; + dfield_t* dfield; + ulint i; + ulint n_col = 0; + ulint n_v_col = 0; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + if (templ->is_virtual) { + ut_ad(n_v_col < dtuple_get_n_v_fields(row)); + dfield = dtuple_get_nth_v_field(row, n_v_col); + n_v_col++; + } else { + dfield = dtuple_get_nth_field(row, n_col); + n_col++; + } + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] + & (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_null(dfield); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, + prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + dict_table_is_comp(prebuilt->table)); + + /* server has issue regarding handling BLOB virtual fields, + and we need to duplicate it with our own memory here */ + if (templ->is_virtual + && DATA_LARGE_MTYPE(dfield_get_type(dfield)->mtype)) { + if (*blob_heap == NULL) { + *blob_heap = mem_heap_create(dfield->len); + } + dfield_dup(dfield, *blob_heap); + } +next_column: + ; + } + + /* If there is a FTS doc id column and it is not user supplied ( + generated by server) then assign it a new doc id. */ + if (!prebuilt->table->fts) { + return; + } + + ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED); + + doc_id_t doc_id; + + if (!DICT_TF2_FLAG_IS_SET(prebuilt->table, DICT_TF2_FTS_HAS_DOC_ID)) { + if (prebuilt->table->fts->cache->first_doc_id + == FTS_NULL_DOC_ID) { + fts_get_next_doc_id(prebuilt->table, &doc_id); + } + return; + } + + dfield_t* fts_doc_id = dtuple_get_nth_field( + row, prebuilt->table->fts->doc_col); + + if (fts_get_next_doc_id(prebuilt->table, &doc_id) == DB_SUCCESS) { + ut_a(doc_id != FTS_NULL_DOC_ID); + ut_ad(sizeof(doc_id) == fts_doc_id->type.len); + dfield_set_data(fts_doc_id, prebuilt->ins_upd_rec_buff + + prebuilt->mysql_row_len, 8); + fts_write_doc_id(fts_doc_id->data, doc_id); + } else { + dfield_set_null(fts_doc_id); + } +} + +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread and in that case the thr is ALREADY in the running state. 
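+(Illustrative note, not part of the original comment: callers invoke this in a retry loop of the form
+  do { ... err = lock_table(...); trx->error_state = err; }
+  while (err != DB_SUCCESS && row_mysql_handle_errors(&err, trx, thr, NULL));
+as in row_lock_table_autoinc_for_mysql() and row_lock_table() below.)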
*/ +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ +{ + dberr_t err; + + DBUG_ENTER("row_mysql_handle_errors"); + DEBUG_SYNC_C("row_mysql_handle_errors"); + + err = trx->error_state; + +handle_new_error: + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + DBUG_LOG("trx", "handle error: " << err + << ";id=" << ib::hex(trx->id) << ", " << trx); + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + extern my_bool innobase_rollback_on_timeout; + if (innobase_rollback_on_timeout) { + goto rollback; + } + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_UNDO_RECORD_TOO_BIG: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + case DB_READ_ONLY: + case DB_FTS_INVALID_DOCID: + case DB_INTERRUPTED: + case DB_CANT_CREATE_GEOMETRY_OBJECT: + case DB_TABLE_NOT_FOUND: + case DB_DECRYPTION_FAILED: + case DB_COMPUTE_VALUE_FAILED: + rollback_to_savept: + DBUG_EXECUTE_IF("row_mysql_crash_if_error", { + log_buffer_flush_to_disk(); + DBUG_SUICIDE(); }); + if (savept) { + /* Roll back the latest, possibly incomplete insertion + or update */ + + trx->rollback(savept); + } + if (!trx->bulk_insert) { + /* MariaDB will roll back the latest SQL statement */ + break; + } + /* MariaDB will roll back the entire transaction. */ + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = 0; + trx->savepoints_discard(); + break; + case DB_LOCK_WAIT: + err = lock_wait(thr); + if (err != DB_SUCCESS) { + goto handle_new_error; + } + + *new_err = err; + + DBUG_RETURN(true); + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + rollback: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx->rollback(); + break; + + case DB_IO_ERROR: + case DB_TABLE_CORRUPT: + case DB_CORRUPTION: + case DB_PAGE_CORRUPTED: + ib::error() << "We detected index corruption in an InnoDB type" + " table. You have to dump + drop + reimport the" + " table or, in a case of widespread corruption," + " dump all InnoDB tables and recreate the whole" + " tablespace. If the mariadbd server crashes after" + " the startup or when you dump the tables. " + << FORCE_RECOVERY_MSG; + goto rollback_to_savept; + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ib::error() << "Cannot delete/update rows with cascading" + " foreign key constraints that exceed max depth of " + << FK_MAX_CASCADE_DEL << ". Please drop excessive" + " foreign constraints and try again"; + goto rollback_to_savept; + case DB_UNSUPPORTED: + ib::error() << "Cannot delete/update rows with cascading" + " foreign key constraints in timestamp-based temporal" + " table. Please drop excessive" + " foreign constraints and try again"; + goto rollback_to_savept; + default: + ib::fatal() << "Unknown error " << err; + } + + if (dberr_t n_err = trx->error_state) { + trx->error_state = DB_SUCCESS; + *new_err = n_err; + } else { + *new_err = err; + } + + DBUG_RETURN(false); +} + +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. 
+@return own: a prebuilt struct */ +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len) /*!< in: length in bytes of a row in + the MySQL format */ +{ + DBUG_ENTER("row_create_prebuilt"); + + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dict_index_t* temp_index; + dtuple_t* ref; + ulint ref_len; + uint srch_key_len = 0; + ulint search_tuple_n_fields; + + search_tuple_n_fields = 2 * (dict_table_get_n_cols(table) + + dict_table_get_n_v_cols(table)); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * unsigned(table->n_cols) >= unsigned(clust_index->n_fields) + - clust_index->table->n_dropped()); + + ref_len = dict_index_get_n_unique(clust_index); + + + /* Maximum size of the buffer needed for conversion of INTs from + little endian format to big endian format in an index. An index + can have maximum 16 columns (MAX_REF_PARTS) in it. Therfore + Max size for PK: 16 * 8 bytes (BIGINT's size) = 128 bytes + Max size Secondary index: 16 * 8 bytes + PK = 256 bytes. */ +#define MAX_SRCH_KEY_VAL_BUFFER 2* (8 * MAX_REF_PARTS) + +#define PREBUILT_HEAP_INITIAL_SIZE \ + ( \ + sizeof(*prebuilt) \ + /* allocd in this function */ \ + + DTUPLE_EST_ALLOC(search_tuple_n_fields) \ + + DTUPLE_EST_ALLOC(ref_len) \ + /* allocd in row_prebuild_sel_graph() */ \ + + sizeof(sel_node_t) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_update_vector() */ \ + + sizeof(upd_node_t) \ + + sizeof(upd_t) \ + + sizeof(upd_field_t) \ + * dict_table_get_n_cols(table) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + /* allocd in row_get_prebuilt_insert_row() */ \ + + sizeof(ins_node_t) \ + /* mysql_row_len could be huge and we are not \ + sure if this prebuilt instance is going to be \ + used in inserts */ \ + + (mysql_row_len < 256 ? mysql_row_len : 0) \ + + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table) \ + + dict_table_get_n_v_cols(table)) \ + + sizeof(que_fork_t) \ + + sizeof(que_thr_t) \ + + sizeof(*prebuilt->pcur) \ + + sizeof(*prebuilt->clust_pcur) \ + ) + + /* Calculate size of key buffer used to store search key in + InnoDB format. MySQL stores INTs in little endian format and + InnoDB stores INTs in big endian format with the sign bit + flipped. All other field types are stored/compared the same + in MySQL and InnoDB, so we must create a buffer containing + the INT key parts in InnoDB format.We need two such buffers + since both start and end keys are used in records_in_range(). 
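+(Illustrative note, not part of the original comment: with MAX_REF_PARTS = 16 the MAX_SRCH_KEY_VAL_BUFFER bound defined above evaluates to 2 * (8 * 16) = 256 bytes; the loop below adds up only the fixed lengths of DATA_INT key parts, so srch_key_len is usually much smaller.)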
*/ + + for (temp_index = dict_table_get_first_index(table); temp_index; + temp_index = dict_table_get_next_index(temp_index)) { + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(temp_index->n_user_defined_cols + == MAX_REF_PARTS);); + if (temp_index->is_corrupted()) { + continue; + } + + uint temp_len = 0; + for (uint i = 0; i < temp_index->n_uniq; i++) { + ulint type = temp_index->fields[i].col->mtype; + if (type == DATA_INT) { + temp_len += + temp_index->fields[i].fixed_len; + } + } + srch_key_len = std::max(srch_key_len,temp_len); + } + + ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER); + + DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value", + ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER);); + + /* We allocate enough space for the objects that are likely to + be created later in order to minimize the number of malloc() + calls */ + heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len); + + prebuilt = static_cast<row_prebuilt_t*>( + mem_heap_zalloc(heap, sizeof(*prebuilt))); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->sql_stat_start = TRUE; + prebuilt->heap = heap; + + prebuilt->srch_key_val_len = srch_key_len; + if (prebuilt->srch_key_val_len) { + prebuilt->srch_key_val1 = static_cast<byte*>( + mem_heap_alloc(prebuilt->heap, + 2 * prebuilt->srch_key_val_len)); + prebuilt->srch_key_val2 = prebuilt->srch_key_val1 + + prebuilt->srch_key_val_len; + } else { + prebuilt->srch_key_val1 = NULL; + prebuilt->srch_key_val2 = NULL; + } + + prebuilt->pcur = static_cast<btr_pcur_t*>( + mem_heap_zalloc(prebuilt->heap, + sizeof(btr_pcur_t))); + prebuilt->clust_pcur = static_cast<btr_pcur_t*>( + mem_heap_zalloc(prebuilt->heap, + sizeof(btr_pcur_t))); + btr_pcur_reset(prebuilt->pcur); + btr_pcur_reset(prebuilt->clust_pcur); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE_UNSET; + + prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + prebuilt->autoinc_error = DB_SUCCESS; + prebuilt->autoinc_offset = 0; + + /* Default to 1, we will set the actual value later in + ha_innobase::get_auto_increment(). */ + prebuilt->autoinc_increment = 1; + + prebuilt->autoinc_last_value = 0; + + /* During UPDATE and DELETE we need the doc id. */ + prebuilt->fts_doc_id = 0; + + prebuilt->mysql_row_len = mysql_row_len; + + prebuilt->fts_doc_id_in_read_set = 0; + prebuilt->blob_heap = NULL; + + DBUG_RETURN(prebuilt); +} + +/** Free a prebuilt struct for a TABLE handle. 
*/ +void row_prebuilt_free(row_prebuilt_t *prebuilt) +{ + DBUG_ENTER("row_prebuilt_free"); + + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_reset(prebuilt->pcur); + btr_pcur_reset(prebuilt->clust_pcur); + + ut_free(prebuilt->mysql_template); + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + if (prebuilt->fetch_cache[0] != NULL) { + byte* base = prebuilt->fetch_cache[0] - 4; + byte* ptr = base; + + for (ulint i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + ulint magic1 = mach_read_from_4(ptr); + ut_a(magic1 == ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + + byte* row = ptr; + ut_a(row == prebuilt->fetch_cache[i]); + ptr += prebuilt->mysql_row_len; + + ulint magic2 = mach_read_from_4(ptr); + ut_a(magic2 == ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + } + + ut_free(base); + } + + if (prebuilt->rtr_info) { + rtr_clean_rtr_info(prebuilt->rtr_info, true); + } + if (prebuilt->table) { + dict_table_close(prebuilt->table); + } + + mem_heap_free(prebuilt->heap); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx) /*!< in: transaction handle */ +{ + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/*********************************************************************//** +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. +@return prebuilt dtuple; the column type information is also set in it */ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node != 0) { + + /* Check if indexes have been dropped or added and we + may need to rebuild the row insert template. 
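+(Illustrative note, not part of the original comment: the check below compares prebuilt->trx_id with table->def_trx_id and the cached entry_list length with the current number of indexes; any DDL advances def_trx_id, so the insert node and query graph are discarded and rebuilt.)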
*/ + + if (prebuilt->trx_id == table->def_trx_id + && prebuilt->ins_node->entry_list.size() + == UT_LIST_GET_LEN(table->indexes)) { + return(prebuilt->ins_node->row); + } + + ut_ad(prebuilt->trx_id < table->def_trx_id); + + que_graph_free_recursive(prebuilt->ins_graph); + + prebuilt->ins_graph = 0; + } + + /* Create an insert node and query graph to the prebuilt struct */ + + ins_node_t* node; + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == 0) { + prebuilt->ins_upd_rec_buff = static_cast<byte*>( + mem_heap_alloc( + prebuilt->heap, + DICT_TF2_FLAG_IS_SET(prebuilt->table, + DICT_TF2_FTS_HAS_DOC_ID) + ? prebuilt->mysql_row_len + 8/* FTS_DOC_ID */ + : prebuilt->mysql_row_len)); + } + + dtuple_t* row; + + row = dtuple_create_with_vcol( + prebuilt->heap, dict_table_get_n_cols(table), + dict_table_get_n_v_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + que_thr_t* fork = pars_complete_graph_for_exec( + node, prebuilt->trx, prebuilt->heap, prebuilt); + fork->state = QUE_THR_RUNNING; + + prebuilt->ins_graph = static_cast<que_fork_t*>( + que_node_get_parent(fork)); + + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + + prebuilt->trx_id = table->def_trx_id; + + return(prebuilt->ins_node->row); +} + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + const dict_table_t* table = prebuilt->table; + que_thr_t* thr; + dberr_t err; + + /* If we already hold an AUTOINC lock on the table then do nothing. + Note: We peek at the value of the current owner without acquiring + lock_sys.latch. */ + if (trx == table->autoinc_trx) { + + return(DB_SUCCESS); + } + + trx->op_info = "setting auto-inc lock"; + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + /* We use the insert query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + do { + thr->run_node = node; + thr->prev_node = node; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx, true); + + err = lock_table(prebuilt->table, NULL, LOCK_AUTO_INC, thr); + + trx->error_state = err; + } while (err != DB_SUCCESS + && row_mysql_handle_errors(&err, trx, thr, NULL)); + + trx->op_info = ""; + + return(err); +} + +/** Lock a table. 
+@param[in,out] prebuilt table handle +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table(row_prebuilt_t* prebuilt) +{ + trx_t* trx = prebuilt->trx; + que_thr_t* thr; + dberr_t err; + + trx->op_info = "setting table lock"; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + do { + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(trx, false); + + err = lock_table(prebuilt->table, NULL, static_cast<lock_mode>( + prebuilt->select_lock_type), thr); + trx->error_state = err; + } while (err != DB_SUCCESS + && row_mysql_handle_errors(&err, trx, thr, NULL)); + + trx->op_info = ""; + + return(err); +} + +/** Determine is tablespace encrypted but decryption failed, is table corrupted +or is tablespace .ibd file missing. +@param[in] table Table +@param[in] trx Transaction +@param[in] push_warning true if we should push warning to user +@retval DB_DECRYPTION_FAILED table is encrypted but decryption failed +@retval DB_CORRUPTION table is corrupted +@retval DB_TABLESPACE_NOT_FOUND tablespace .ibd file not found */ +static +dberr_t +row_mysql_get_table_status( + const dict_table_t* table, + trx_t* trx, + bool push_warning = true) +{ + dberr_t err; + if (const fil_space_t* space = table->space) { + if (space->crypt_data && space->crypt_data->is_encrypted()) { + // maybe we cannot access the table due to failing + // to decrypt + if (push_warning) { + ib_push_warning(trx, DB_DECRYPTION_FAILED, + "Table %s is encrypted." + "However key management plugin or used key_id is not found or" + " used encryption algorithm or method does not match.", + table->name.m_name); + } + + err = DB_DECRYPTION_FAILED; + } else { + if (push_warning) { + ib_push_warning(trx, DB_CORRUPTION, + "Table %s in tablespace %lu corrupted.", + table->name.m_name, table->space); + } + + err = DB_CORRUPTION; + } + } else { + ib::error() << ".ibd file is missing for table " + << table->name; + err = DB_TABLESPACE_NOT_FOUND; + } + + return(err); +} + +/** Does an insert for MySQL. 
+@param[in] mysql_rec row in the MySQL format +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_insert_for_mysql( + const byte* mysql_rec, + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) +{ + trx_savept_t savept; + que_thr_t* thr; + dberr_t err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + dict_table_t* table = prebuilt->table; + + /* FIX_ME: This blob heap is used to compensate an issue in server + for virtual column blob handling */ + mem_heap_t* blob_heap = NULL; + + ut_ad(trx); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + if (!table->space) { + ib::error() << "The table " << table->name + << " doesn't have a corresponding tablespace, it was" + " discarded."; + + return(DB_TABLESPACE_DELETED); + } else if (!table->is_readable()) { + return row_mysql_get_table_status(table, trx, true); + } else if (high_level_read_only) { + return(DB_READ_ONLY); + } else if (UNIV_UNLIKELY(table->corrupted) + || dict_table_get_first_index(table)->is_corrupted()) { + return DB_TABLE_CORRUPT; + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } + + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec, + &blob_heap); + + if (ins_mode != ROW_INS_NORMAL) { + node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL); + } + + /* Because we now allow multiple INSERT into the same + initially empty table in bulk insert mode, on error we must + roll back to the start of the transaction. For correctness, it + would suffice to roll back to the start of the first insert + into this empty table, but we will keep it simple and efficient. */ + savept.least_undo_no = trx->bulk_insert ? 0 : trx->undo_no; + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + node->trx_id = trx->id; + } + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + DEBUG_SYNC_C("ib_after_row_insert_step"); + + err = trx->error_state; + + if (err != DB_SUCCESS) { +error_exit: + /* FIXME: What's this ? 
*/ + thr->lock_state = QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, &savept); + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + ut_ad(node->state == INS_NODE_INSERT_ENTRIES + || node->state == INS_NODE_ALLOC_ROW_ID + || node->state == INS_NODE_SET_IX_LOCK); + goto run_again; + } + + trx->op_info = ""; + + if (blob_heap != NULL) { + mem_heap_free(blob_heap); + } + + return(err); + } + + if (dict_table_has_fts_index(table) + && (!table->versioned() + || !node->row->fields[table->vers_end].vers_history_row())) { + + doc_id_t doc_id; + + /* Extract the doc id from the hidden FTS column */ + doc_id = fts_get_doc_id_from_row(table, node->row); + + if (doc_id <= 0) { + ib::error() << "FTS_DOC_ID must be larger than 0 for table " + << table->name; + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + doc_id_t next_doc_id + = table->fts->cache->next_doc_id; + + if (doc_id < next_doc_id) { + ib::error() << "FTS_DOC_ID must be larger than " + << next_doc_id - 1 << " for table " + << table->name; + + err = DB_FTS_INVALID_DOCID; + trx->error_state = DB_FTS_INVALID_DOCID; + goto error_exit; + } + } + + if (table->skip_alter_undo) { + if (trx->fts_trx == NULL) { + trx->fts_trx = fts_trx_create(trx); + } + + fts_trx_table_t ftt; + ftt.table = table; + ftt.fts_trx = trx->fts_trx; + + fts_add_doc_from_tuple(&ftt, doc_id, node->row); + } else { + /* Pass NULL for the columns affected, since an INSERT affects + all FTS indexes. */ + fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL); + } + } + + /* Not protected by dict_sys.latch or table->stats_mutex_lock() + for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_inc(table); + + if (prebuilt->clust_index_was_generated) { + /* set row id to prebuilt */ + memcpy(prebuilt->row_id, node->sys_buf, DATA_ROW_ID_LEN); + } + + dict_stats_update_if_needed(table, *trx); + trx->op_info = ""; + + if (blob_heap != NULL) { + mem_heap_free(blob_heap); + } + + return(err); +} + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + que_thr_t* fork = pars_complete_graph_for_exec( + node, prebuilt->trx, prebuilt->heap, prebuilt); + fork->state = QUE_THR_RUNNING; + + prebuilt->sel_graph = static_cast<que_fork_t*>( + que_node_get_parent(fork)); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. 
+@return own: update node */ +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap) /*!< in: mem heap from which allocated */ +{ + upd_node_t* node; + + DBUG_ENTER("row_create_update_node_for_mysql"); + + node = upd_node_create(heap); + + node->in_mysql_interface = true; + node->is_delete = NO_DELETE; + node->pcur = new (mem_heap_alloc(heap, sizeof(btr_pcur_t))) + btr_pcur_t(); + + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table) + + dict_table_get_n_v_cols(table), heap); + + node->update_n_fields = dict_table_get_n_cols(table); + + UT_LIST_INIT(node->columns, &sym_node_t::col_var_list); + + node->has_clust_rec_x_lock = TRUE; + + DBUG_RETURN(node); +} + +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ +{ + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and query graph to the prebuilt struct */ + + prebuilt->upd_node = row_create_update_node_for_mysql( + prebuilt->table, prebuilt->heap); + + prebuilt->upd_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + prebuilt->upd_node, + prebuilt->trx, prebuilt->heap, + prebuilt))); + + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/******************************************************************** +Handle an update of a column that has an FTS index. */ +static +void +row_fts_do_update( +/*==============*/ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: Table with FTS index */ + doc_id_t old_doc_id, /* in: old document id */ + doc_id_t new_doc_id) /* in: new document id */ +{ + if(trx->fts_next_doc_id) { + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + if(new_doc_id != FTS_NULL_DOC_ID) + fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL); + } +} + +/************************************************************************ +Handles FTS matters for an update or a delete. +NOTE: should not be called if the table does not have an FTS index. .*/ +static +dberr_t +row_fts_update_or_delete( +/*=====================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_t* trx = prebuilt->trx; + dict_table_t* table = prebuilt->table; + upd_node_t* node = prebuilt->upd_node; + doc_id_t old_doc_id = prebuilt->fts_doc_id; + + DBUG_ENTER("row_fts_update_or_delete"); + + ut_a(dict_table_has_fts_index(prebuilt->table)); + + /* Deletes are simple; get them out of the way first. 
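+	A delete only needs an FTS_DELETE operation for the old document id,
+	while an update must also register an FTS_INSERT for the new one.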
*/ + if (node->is_delete) { + /* A delete affects all FTS indexes, so we pass NULL */ + fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL); + } else { + doc_id_t new_doc_id; + new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id); + + if (new_doc_id == 0) { + ib::error() << "InnoDB FTS: Doc ID cannot be 0"; + DBUG_RETURN(DB_FTS_INVALID_DOCID); + } + row_fts_do_update(trx, table, old_doc_id, new_doc_id); + } + + DBUG_RETURN(DB_SUCCESS); +} + +/*********************************************************************//** +Initialize the Doc ID system for FK table with FTS index */ +static +void +init_fts_doc_id_for_ref( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint* depth) /*!< in: recusive call depth */ +{ + table->fk_max_recusive_level = 0; + + /* Limit on tables involved in cascading delete/update */ + if (++*depth > FK_MAX_CASCADE_DEL) { + return; + } + + /* Loop through this table's referenced list and also + recursively traverse each table's foreign table list */ + for (dict_foreign_t* foreign : table->referenced_set) { + ut_ad(foreign->foreign_table); + + if (foreign->foreign_table->fts) { + fts_init_doc_id(foreign->foreign_table); + } + + if (foreign->foreign_table != table + && !foreign->foreign_table->referenced_set.empty()) { + init_fts_doc_id_for_ref( + foreign->foreign_table, depth); + } + } +} + +/** Does an update or delete of a row for MySQL. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_update_for_mysql(row_prebuilt_t* prebuilt) +{ + trx_savept_t savept; + dberr_t err; + que_thr_t* thr; + dict_index_t* clust_index; + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + ulint fk_depth = 0; + + DBUG_ENTER("row_update_for_mysql"); + + ut_ad(trx); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(table->stat_initialized); + + if (!table->is_readable()) { + return(row_mysql_get_table_status(table, trx, true)); + } + + if (high_level_read_only) { + return(DB_READ_ONLY); + } + + DEBUG_SYNC_C("innodb_row_update_for_mysql_begin"); + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + init_fts_doc_id_for_ref(table, &fk_depth); + + if (!table->no_rollback()) { + trx_start_if_not_started_xa(trx, true); + } + + node = prebuilt->upd_node; + const bool is_delete = node->is_delete == PLAIN_DELETE; + ut_ad(node->table == table); + + clust_index = dict_table_get_first_index(table); + + btr_pcur_copy_stored_position(node->pcur, + prebuilt->pcur->index() == clust_index + ? 
prebuilt->pcur + : prebuilt->clust_pcur); + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept.least_undo_no = trx->undo_no; + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + ut_ad(!prebuilt->versioned_write || node->table->versioned()); + + if (prebuilt->versioned_write && node->is_delete == VERSIONED_DELETE) { + node->vers_make_delete(trx); + } + + for (;;) { + thr->run_node = node; + thr->prev_node = node; + thr->fk_cascade_depth = 0; + + row_upd_step(thr); + + err = trx->error_state; + + if (err == DB_SUCCESS) { + break; + } + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + goto error; + } + + thr->lock_state= QUE_THR_LOCK_ROW; + + DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error"); + + bool was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (!was_lock_wait) { + goto error; + } + } + + if (dict_table_has_fts_index(table) + && trx->fts_next_doc_id != UINT64_UNDEFINED) { + err = row_fts_update_or_delete(prebuilt); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ut_ad("unexpected error" == 0); + goto error; + } + } + + /* Completed cascading operations (if any) */ + bool update_statistics; + ut_ad(is_delete == (node->is_delete == PLAIN_DELETE)); + + if (is_delete) { + /* Not protected by dict_sys.latch + or prebuilt->table->stats_mutex_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(prebuilt->table); + + update_statistics = !srv_stats_include_delete_marked; + } else { + update_statistics + = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + } + + if (update_statistics) { + dict_stats_update_if_needed(prebuilt->table, *trx); + } else { + /* Always update the table modification counter. */ + prebuilt->table->stat_modified_counter++; + } + +error: + trx->op_info = ""; + DBUG_RETURN(err); +} + +/** This can only be used when the current transaction is at +READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_mvcc() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] has_latches_on_recs TRUE if called so that we have the + latches on the records under pcur + and clust_pcur, and we do not need + to reposition the cursors. 
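+				Note: if the clustered index record was
+				modified by this transaction itself, its
+				lock is kept.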
*/ +void +row_unlock_for_mysql( + row_prebuilt_t* prebuilt, + ibool has_latches_on_recs) +{ + if (prebuilt->new_rec_locks == 1 && prebuilt->index->is_clust()) { + trx_t* trx = prebuilt->trx; + ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED); + trx->op_info = "unlock_row"; + + const rec_t* rec; + dict_index_t* index; + trx_id_t rec_trx_id; + mtr_t mtr; + btr_pcur_t* pcur = prebuilt->pcur; + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs + && pcur->restore_position(BTR_SEARCH_LEAF, &mtr) + != btr_pcur_t::SAME_ALL) { + goto no_unlock; + } + + rec = btr_pcur_get_rec(pcur); + index = pcur->index(); + + /* If the record has been modified by this + transaction, do not unlock it. */ + + if (index->trx_id_offset) { + rec_trx_id = trx_read_trx_id(rec + + index->trx_id_offset); + } else { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + if (rec_trx_id != trx->id) { + /* We did not update the record: unlock it */ + + rec = btr_pcur_get_rec(pcur); + + lock_rec_unlock( + trx, + btr_pcur_get_block(pcur)->page.id(), + rec, + static_cast<enum lock_mode>( + prebuilt->select_lock_type)); + } +no_unlock: + mtr_commit(&mtr); + trx->op_info = ""; + } +} + +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Insert history row when evaluating foreign key referential action. + +1. Create new dtuple_t 'row' from node->historical_row; +2. Update its row_end to current timestamp; +3. Insert it to a table; +4. Update table statistics. + +This is used in UPDATE CASCADE/SET NULL of a system versioned referenced table. + +node->historical_row: dtuple_t containing pointers of row changed by refertial +action. 
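+node->historical_heap: memory heap used for building the history row; it is
+freed before this function returns.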
+ +@param[in] thr current query thread +@param[in] node a node which just updated a row in a foreign table +@return DB_SUCCESS or some error */ +static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node) +{ + trx_t* trx = thr_get_trx(thr); + dfield_t* row_end; + char row_end_data[8]; + dict_table_t* table = node->table; + const unsigned zip_size = table->space->zip_size(); + ut_ad(table->versioned()); + + dtuple_t* row; + const ulint n_cols = dict_table_get_n_cols(table); + const ulint n_v_cols = dict_table_get_n_v_cols(table); + + ut_ad(n_cols == dtuple_get_n_fields(node->historical_row)); + ut_ad(n_v_cols == dtuple_get_n_v_fields(node->historical_row)); + + row = dtuple_create_with_vcol(node->historical_heap, n_cols, n_v_cols); + + dict_table_copy_types(row, table); + + ins_node_t* insert_node = + ins_node_create(INS_DIRECT, table, node->historical_heap); + + if (!insert_node) { + trx->error_state = DB_OUT_OF_MEMORY; + goto exit; + } + + insert_node->common.parent = thr; + ins_node_set_new_row(insert_node, row); + + ut_ad(n_cols > DATA_N_SYS_COLS); + // Exclude DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR + for (ulint i = 0; i < n_cols - DATA_N_SYS_COLS; i++) { + dfield_t *src= dtuple_get_nth_field(node->historical_row, i); + dfield_t *dst= dtuple_get_nth_field(row, i); + dfield_copy(dst, src); + if (dfield_is_ext(src)) { + byte *field_data + = static_cast<byte*>(dfield_get_data(src)); + ulint ext_len; + ulint field_len = dfield_get_len(src); + + ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE); + + ut_a(memcmp(field_data + field_len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + + byte *data = btr_copy_externally_stored_field( + &ext_len, field_data, zip_size, field_len, + node->historical_heap); + dfield_set_data(dst, data, ext_len); + } + } + + for (ulint i = 0; i < n_v_cols; i++) { + dfield_t *dst= dtuple_get_nth_v_field(row, i); + dfield_t *src= dtuple_get_nth_v_field(node->historical_row, i); + dfield_copy(dst, src); + } + + node->historical_row = NULL; + + row_end = dtuple_get_nth_field(row, table->vers_end); + if (dict_table_get_nth_col(table, table->vers_end)->vers_native()) { + mach_write_to_8(row_end_data, trx->id); + dfield_set_data(row_end, row_end_data, 8); + } else { + thd_get_query_start_data(trx->mysql_thd, row_end_data); + dfield_set_data(row_end, row_end_data, 7); + } + + for (;;) { + thr->run_node = insert_node; + thr->prev_node = insert_node; + + row_ins_step(thr); + + switch (trx->error_state) { + case DB_LOCK_WAIT: + if (lock_wait(thr) == DB_SUCCESS) { + continue; + } + + /* fall through */ + default: + /* Other errors are handled for the parent node. */ + thr->fk_cascade_depth = 0; + goto exit; + + case DB_SUCCESS: + dict_stats_update_if_needed(table, *trx); + goto exit; + } + } +exit: + que_graph_free_recursive(insert_node); + mem_heap_free(node->historical_heap); + node->historical_heap = NULL; + return trx->error_state; +} + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. 
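+The given query thread is reused for the cascaded operation;
+fk_cascade_depth limits the recursion to FK_MAX_CASCADE_DEL levels, beyond
+which DB_FOREIGN_EXCEED_MAX_CASCADE is returned.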
+@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ +{ + /* Increment fk_cascade_depth to record the recursive call depth on + a single update/delete that affects multiple tables chained + together with foreign key relations. */ + + if (++thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { + return(DB_FOREIGN_EXCEED_MAX_CASCADE); + } + + trx_t* trx = thr_get_trx(thr); + + if (table->versioned()) { + if (node->is_delete == PLAIN_DELETE) { + node->vers_make_delete(trx); + } else if (node->update->affects_versioned()) { + dberr_t err = row_update_vers_insert(thr, node); + if (err != DB_SUCCESS) { + return err; + } + node->vers_make_update(trx); + } + } + + for (;;) { + thr->run_node = node; + thr->prev_node = node; + + DEBUG_SYNC_C("foreign_constraint_update_cascade"); + { + TABLE *mysql_table = thr->prebuilt->m_mysql_table; + thr->prebuilt->m_mysql_table = NULL; + row_upd_step(thr); + thr->prebuilt->m_mysql_table = mysql_table; + } + + switch (trx->error_state) { + case DB_LOCK_WAIT: + if (lock_wait(thr) == DB_SUCCESS) { + continue; + } + + /* fall through */ + default: + /* Other errors are handled for the parent node. */ + thr->fk_cascade_depth = 0; + return trx->error_state; + + case DB_SUCCESS: + thr->fk_cascade_depth = 0; + bool stats; + + if (node->is_delete == PLAIN_DELETE) { + /* Not protected by dict_sys.latch + or node->table->stats_mutex_lock() for + performance reasons, we would rather + get garbage in stat_n_rows (which is + just an estimate anyway) than + protecting the following code with a + latch. */ + dict_table_n_rows_dec(node->table); + + stats = !srv_stats_include_delete_marked; + } else { + stats = !(node->cmpl_info + & UPD_NODE_NO_ORD_CHANGE); + } + + if (stats) { + dict_stats_update_if_needed(node->table, *trx); + } else { + /* Always update the table + modification counter. */ + node->table->stat_modified_counter++; + } + + return(DB_SUCCESS); + } + } +} + +/*********************************************************************//** +Creates a table for MySQL. On failure the transaction will be rolled back +and the 'table' object will be freed. 
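+The caller must be holding the data dictionary latch with an active
+transaction.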
+@return error code or DB_SUCCESS */ +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx) /*!< in/out: transaction */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(dict_sys.sys_tables_exist()); + ut_ad(dict_sys.locked()); + ut_ad(trx->dict_operation_lock_mode); + + DEBUG_SYNC_C("create_table"); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_start_of_row_create_table_for_mysql", + dict_mem_table_free(table); return DB_ERROR; + ); + + trx->op_info = "creating table"; + + heap = mem_heap_create(512); + + trx->dict_operation = true; + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr)))); + + que_run_threads(thr); + + dberr_t err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx->rollback(); + dict_mem_table_free(table); + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Create an index when creating a table. +On failure, the caller must drop the table! +@return error number or DB_SUCCESS */ +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths, /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id) /*!< in: encryption key_id */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + ulint i; + ulint len; + dict_table_t* table = index->table; + + ut_ad(dict_sys.locked()); + + for (i = 0; i < index->n_def; i++) { + /* Check that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ + + len = dict_index_get_nth_field(index, i)->prefix_len; + + if (field_lengths && field_lengths[i]) { + len = ut_max(len, field_lengths[i]); + } + + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_create_index", + len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1; + ); + + /* Column or prefix length exceeds maximum column length */ + if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { + dict_mem_index_free(index); + return DB_TOO_BIG_INDEX_COL; + } + } + + /* For temp-table we avoid insertion into SYSTEM TABLES to + maintain performance and so we have separate path that directly + just updates dictonary cache. */ + if (!table->is_temporary()) { + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(trx->dict_operation); + trx->op_info = "creating index"; + + /* Note that the space id where we store the index is + inherited from the table in dict_build_index_def_step() + in dict0crea.cc. 
*/ + + heap = mem_heap_create(512); + node = ind_create_graph_create(index, table->name.m_name, + heap, mode, key_id); + + thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + ut_a(thr == que_fork_start_command( + static_cast<que_fork_t*>( + que_node_get_parent(thr)))); + + que_run_threads(thr); + + err = trx->error_state; + + index = node->index; + + ut_ad(!index == (err != DB_SUCCESS)); + + que_graph_free((que_t*) que_node_get_parent(thr)); + + if (index && (index->type & DICT_FTS)) { + err = fts_create_index_tables(trx, index, table->id); + } + + trx->op_info = ""; + } else { + dict_build_index_def(table, index, trx); + + err = dict_index_add_to_cache(index, FIL_NULL); + ut_ad((index == NULL) == (err != DB_SUCCESS)); + if (UNIV_LIKELY(err == DB_SUCCESS)) { + ut_ad(!index->is_instant()); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + + err = dict_create_index_tree_in_mem(index, trx); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + + if (err != DB_SUCCESS) { + dict_index_remove_from_cache(table, index); + } + } + } + + return(err); +} + +/** Reassigns the table identifier of a table. +@param[in,out] table table +@param[in,out] trx transaction +@param[out] new_id new table id +@return error code or DB_SUCCESS */ +static +dberr_t +row_mysql_table_id_reassign( + dict_table_t* table, + trx_t* trx, + table_id_t* new_id) +{ + if (!dict_sys.sys_tables || dict_sys.sys_tables->corrupted || + !dict_sys.sys_columns || dict_sys.sys_columns->corrupted || + !dict_sys.sys_indexes || dict_sys.sys_indexes->corrupted || + !dict_sys.sys_virtual || dict_sys.sys_virtual->corrupted) { + return DB_CORRUPTION; + } + + dberr_t err; + pars_info_t* info = pars_info_create(); + + dict_hdr_get_new_id(new_id, NULL, NULL); + + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", *new_id); + + /* Note: This cannot be rolled back. Rollback would see the + UPDATE SYS_INDEXES as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() when rolling back the + INSERT, effectively dropping all indexes of the table. */ + err = que_eval_sql( + info, + "PROCEDURE RENUMBER_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_VIRTUAL SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n", trx); + + return(err); +} + +/*********************************************************************//** +Do the foreign key constraint checks. +@return DB_SUCCESS or error code. 
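+DB_CANNOT_DROP_CONSTRAINT if another table references the table to be
+discarded and foreign key checks are enabled.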
*/ +static +dberr_t +row_discard_tablespace_foreign_key_checks( +/*======================================*/ + const trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* table) /*!< in: table to be discarded */ +{ + + if (srv_read_only_mode || !trx->check_foreigns) { + return(DB_SUCCESS); + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + dict_foreign_set::const_iterator it + = std::find_if(table->referenced_set.begin(), + table->referenced_set.end(), + dict_foreign_different_tables()); + + if (it == table->referenced_set.end()) { + return(DB_SUCCESS); + } + + const dict_foreign_t* foreign = *it; + FILE* ef = dict_foreign_err_file; + + ut_ad(foreign->foreign_table != table); + ut_ad(foreign->referenced_table == table); + + /* We only allow discarding a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mysql_mutex_lock(&dict_foreign_err_mutex); + + rewind(ef); + + ut_print_timestamp(ef); + + fputs(" Cannot DISCARD table ", ef); + ut_print_name(ef, trx, table->name.m_name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + + mysql_mutex_unlock(&dict_foreign_err_mutex); + + return(DB_CANNOT_DROP_CONSTRAINT); +} + +/*********************************************************************//** +Do the DISCARD TABLESPACE operation. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_discard_tablespace( +/*===================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table) /*!< in/out: table to be discarded */ +{ + dberr_t err; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. The SQL layer will block all DML on the table using MDL and a + DISCARD will not start unless all existing operations on the + table to be discarded are completed. + + 1) Acquire the data dictionary latch in X mode. This will + prevent any internal operations that are not covered by + MDL or InnoDB table locks. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: we remove all entries for the tablespace in + the insert buffer tree. */ + + ibuf_delete_for_discarded_space(table->space_id); + + table_id_t new_id; + + /* Set the TABLESPACE DISCARD flag in the table definition + on disk. */ + err = row_import_update_discarded_flag(trx, table->id, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Update the index root pages in the system tables, on disk */ + err = row_import_update_index_root(trx, table, true); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Drop all the FTS auxiliary tables. */ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + + fts_drop_tables(trx, *table); + } + + /* Assign a new space ID to the table definition so that purge + can ignore the changes. Update the system table on disk. */ + + err = row_mysql_table_id_reassign(table, trx, &new_id); + + if (err != DB_SUCCESS) { + return(err); + } + + /* All persistent operations successful, update the + data dictionary memory cache. */ + + dict_table_change_id_in_cache(table, new_id); + + dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + if (index) index->clear_instant_alter(); + + /* Reset the root page numbers. 
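+	Presumably this marks every index tree of the discarded table as
+	missing until the tablespace is imported again.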
*/ + for (; index; index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; + } + + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. */ + return DB_SUCCESS; +} + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function renames the .ibd file and assigns a new table id for +the table. Also the file_unreadable flag is set. +@return error code or DB_SUCCESS */ +dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx) +{ + ut_ad(!is_system_tablespace(table->space_id)); + ut_ad(!table->is_temporary()); + + const auto fts_exist = table->flags2 & + (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + + dberr_t err; + + if (fts_exist) + { + fts_optimize_remove_table(table); + purge_sys.stop_FTS(*table); + err= fts_lock_tables(trx, *table); + if (err != DB_SUCCESS) + { +rollback: + if (fts_exist) + { + purge_sys.resume_FTS(); + fts_optimize_add_table(table); + } + trx->rollback(); + if (trx->dict_operation_lock_mode) + row_mysql_unlock_data_dictionary(trx); + return err; + } + } + + row_mysql_lock_data_dictionary(trx); + trx->op_info = "discarding tablespace"; + trx->dict_operation= true; + + /* We serialize data dictionary operations with dict_sys.latch: + this is to avoid deadlocks during data dictionary operations */ + + err= row_discard_tablespace_foreign_key_checks(trx, table); + if (err != DB_SUCCESS) + goto rollback; + + /* Note: The following cannot be rolled back. Rollback would see the + UPDATE of SYS_INDEXES.TABLE_ID as two operations: DELETE and INSERT. + It would invoke btr_free_if_exists() when rolling back the INSERT, + effectively dropping all indexes of the table. Furthermore, calls like + ibuf_delete_for_discarded_space() are already discarding data + before the transaction is committed. + + It would be better to remove the integrity-breaking + ALTER TABLE...DISCARD TABLESPACE operation altogether. */ + table->file_unreadable= true; + table->space= nullptr; + table->flags2|= DICT_TF2_DISCARDED; + err= row_discard_tablespace(trx, table); + DBUG_EXECUTE_IF("ib_discard_before_commit_crash", + log_buffer_flush_to_disk(); DBUG_SUICIDE();); + /* FTS_ tables may be deleted */ + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + const auto space_id= table->space_id; + pfs_os_file_t d= fil_delete_tablespace(space_id); + DBUG_EXECUTE_IF("ib_discard_after_commit_crash", DBUG_SUICIDE();); + row_mysql_unlock_data_dictionary(trx); + + if (d != OS_FILE_CLOSED) + os_file_close(d); + for (pfs_os_file_t d : deleted) + os_file_close(d); + + if (fts_exist) + purge_sys.resume_FTS(); + + ibuf_delete_for_discarded_space(space_id); + buf_flush_remove_pages(space_id); + trx->op_info= ""; + return err; +} + +/****************************************************************//** +Delete a single constraint. 
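+The constraint is removed from both SYS_FOREIGN and SYS_FOREIGN_COLS.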
+@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint_low( +/*======================*/ + const char* id, /*!< in: constraint id */ + trx_t* trx) /*!< in: transaction handle */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", id); + + return(que_eval_sql(info, + "PROCEDURE DELETE_CONSTRAINT () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" + "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n" + "END;\n", trx)); +} + +/****************************************************************//** +Delete a single constraint. +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_delete_constraint( +/*==================*/ + const char* id, /*!< in: constraint id */ + const char* database_name, /*!< in: database name, with the + trailing '/' */ + mem_heap_t* heap, /*!< in: memory heap */ + trx_t* trx) /*!< in: transaction handle */ +{ + dberr_t err; + + /* New format constraints have ids <databasename>/<constraintname>. */ + err = row_delete_constraint_low( + mem_heap_strcat(heap, database_name, id), trx); + + if ((err == DB_SUCCESS) && !strchr(id, '/')) { + /* Old format < 4.0.18 constraints have constraint ids + NUMBER_NUMBER. We only try deleting them if the + constraint name does not contain a '/' character, otherwise + deleting a new format constraint named 'foo/bar' from + database 'baz' would remove constraint 'bar' from database + 'foo', if it existed. */ + + err = row_delete_constraint_low(id, trx); + } + + return(err); +} + +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool use_fk) /*!< in: whether to parse and enforce + FOREIGN KEY constraints */ +{ + dict_table_t* table = NULL; + dberr_t err = DB_ERROR; + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool old_is_tmp, new_is_tmp; + pars_info_t* info = NULL; + + ut_a(old_name != NULL); + ut_a(new_name != NULL); + ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_ad(trx->dict_operation_lock_mode); + + if (high_level_read_only) { + return(DB_READ_ONLY); + } + + trx->op_info = "renaming table"; + + old_is_tmp = dict_table_t::is_temporary_name(old_name); + new_is_tmp = dict_table_t::is_temporary_name(new_name); + + table = dict_table_open_on_name(old_name, true, + DICT_ERR_IGNORE_FK_NOKEY); + + /* MariaDB partition engine hard codes the file name + separator as "#P#" and "#SP#". The text case is fixed even if + lower_case_table_names is set to 1 or 2. InnoDB always + normalises file names to lower case on Windows, this + can potentially cause problems when copying/moving + tables between platforms. + + 1) If boot against an installation from Windows + platform, then its partition table name could + be all be in lower case in system tables. So we + will need to check lower case name when load table. + + 2) If we boot an installation from other case + sensitive platform in Windows, we might need to + check the existence of table name without lowering + case them in the system table. 
*/ + if (!table && lower_case_table_names == 1 + && strstr(old_name, table_name_t::part_suffix)) { + char par_case_name[MAX_FULL_NAME_LEN + 1]; +#ifndef _WIN32 + /* Check for the table using lower + case name, including the partition + separator "P" */ + memcpy(par_case_name, old_name, + strlen(old_name)); + par_case_name[strlen(old_name)] = 0; + innobase_casedn_str(par_case_name); +#else + /* On Windows platfrom, check + whether there exists table name in + system table whose name is + not being normalized to lower case */ + normalize_table_name_c_low( + par_case_name, old_name, FALSE); +#endif + table = dict_table_open_on_name(par_case_name, true, + DICT_ERR_IGNORE_FK_NOKEY); + } + + if (!table) { + err = DB_TABLE_NOT_FOUND; + goto funct_exit; + } + + ut_ad(!table->is_temporary()); + + if (!table->is_readable() && !table->space + && !(table->flags2 & DICT_TF2_DISCARDED)) { + + err = DB_TABLE_NOT_FOUND; + + ib::error() << "Table " << old_name << " does not have an .ibd" + " file in the database directory. " + << TROUBLESHOOTING_MSG; + + goto funct_exit; + + } else if (use_fk && !old_is_tmp && new_is_tmp) { + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints( + heap, trx, table, &n_constraints_to_drop, + &constraints_to_drop); + + if (err != DB_SUCCESS) { + goto funct_exit; + } + } + + err = trx_undo_report_rename(trx, table); + + if (err != DB_SUCCESS) { + goto funct_exit; + } + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data from system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET NAME = :new_table_name\n" + " WHERE NAME = :old_table_name;\n" + "END;\n", trx); + + if (err != DB_SUCCESS) { + // Assume the caller guarantees destination name doesn't exist. + ut_ad(err != DB_DUPLICATE_KEY); + goto rollback_and_exit; + } + + if (!new_is_tmp) { + /* Rename all constraints. */ + char new_table_name[MAX_TABLE_NAME_LEN + 1]; + char old_table_utf8[MAX_TABLE_NAME_LEN + 1]; + uint errors = 0; + + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + old_table_utf8[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(old_table_utf8, '/') + 1, + strchr(old_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). 
*/ + strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN); + old_table_utf8[MAX_TABLE_NAME_LEN] = '\0'; + } + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + pars_info_add_str_literal(info, "old_table_name_utf8", + old_table_utf8); + + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + new_table_name[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(new_table_name, '/') + 1, + strchr(new_name, '/') +1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted from charset + my_charset_filename to UTF-8. This means that the + table name is already in UTF-8 (#mysql#50). */ + strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN); + new_table_name[MAX_TABLE_NAME_LEN] = '\0'; + } + + pars_info_add_str_literal(info, "new_table_utf8", new_table_name); + + err = que_eval_sql( + info, + "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "offset INT;\n" + "found INT;\n" + "BEGIN\n" + "found := 1;\n" + "old_db_name_len := INSTR(:old_table_name, '/')-1;\n" + "new_db_name_len := INSTR(:new_table_name, '/')-1;\n" + "new_db_name := SUBSTR(:new_table_name, 0,\n" + " new_db_name_len);\n" + "old_t_name_len := LENGTH(:old_table_name);\n" + "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n" + " '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :old_table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:old_table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = :new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " offset := INSTR(foreign_id, '_ibfk_') - 1;\n" + " new_foreign_id :=\n" + " CONCAT(:new_table_utf8,\n" + " SUBSTR(foreign_id, offset,\n" + " id_len - offset));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n" + "WHERE REF_NAME = :old_table_name\n" + " AND TO_BINARY(REF_NAME)\n" + " = TO_BINARY(:old_table_name);\n" + "END;\n", trx); + + } else if (n_constraints_to_drop > 0) { + /* Drop some constraints of tmp tables. 
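+		These are the constraints that the ALTER TABLE statement
+		explicitly dropped with DROP FOREIGN KEY.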
*/ + + ulint db_name_len = dict_get_db_name_len(old_name) + 1; + char* db_name = mem_heap_strdupl(heap, old_name, + db_name_len); + ulint i; + + for (i = 0; i < n_constraints_to_drop; i++) { + err = row_delete_constraint(constraints_to_drop[i], + db_name, heap, trx); + + if (err != DB_SUCCESS) { + break; + } + } + } + + if (err == DB_SUCCESS + && (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) + && !dict_tables_have_same_db(old_name, new_name)) { + err = fts_rename_aux_tables(table, new_name, trx); + } + + switch (err) { + case DB_DUPLICATE_KEY: + ib::error() << "Table rename might cause two" + " FOREIGN KEY constraints to have the same" + " internal name in case-insensitive comparison."; + ib::info() << TROUBLESHOOTING_MSG; + /* fall through */ + rollback_and_exit: + default: + trx->error_state = DB_SUCCESS; + trx->rollback(); + trx->error_state = DB_SUCCESS; + break; + case DB_SUCCESS: + DEBUG_SYNC_C("innodb_rename_in_cache"); + /* The following call will also rename the .ibd file */ + err = dict_table_rename_in_cache( + table, span<const char>{new_name,strlen(new_name)}, + false); + if (err != DB_SUCCESS) { + goto rollback_and_exit; + } + + /* In case of copy alter, template db_name and + table_name should be renamed only for newly + created table. */ + if (table->vc_templ != NULL && !new_is_tmp) { + innobase_rename_vc_templ(table); + } + + /* We only want to switch off some of the type checking in + an ALTER TABLE, not in a RENAME. */ + dict_names_t fk_tables; + + err = dict_load_foreigns( + new_name, nullptr, trx->id, + !old_is_tmp || trx->check_foreigns, + use_fk + ? DICT_ERR_IGNORE_NONE + : DICT_ERR_IGNORE_FK_NOKEY, + fk_tables); + + if (err != DB_SUCCESS) { + if (old_is_tmp) { + /* In case of copy alter, ignore the + loading of foreign key constraint + when foreign_key_check is disabled */ + ib::error_or_warn(trx->check_foreigns) + << "In ALTER TABLE " + << ut_get_name(trx, new_name) + << " has or is referenced in foreign" + " key constraints which are not" + " compatible with the new table" + " definition."; + if (!trx->check_foreigns) { + err = DB_SUCCESS; + break; + } + } else { + ib::error() << "In RENAME TABLE table " + << ut_get_name(trx, new_name) + << " is referenced in foreign key" + " constraints which are not compatible" + " with the new table definition."; + } + + goto rollback_and_exit; + } + + /* Check whether virtual column or stored column affects + the foreign key constraint of the table. */ + if (dict_foreigns_has_s_base_col(table->foreign_set, table)) { + err = DB_NO_FK_ON_S_BASE_COL; + goto rollback_and_exit; + } + + /* Fill the virtual column set in foreign when + the table undergoes copy alter operation. */ + dict_mem_table_free_foreign_vcol_set(table); + dict_mem_table_fill_foreign_vcol_set(table); + + while (!fk_tables.empty()) { + const char *f = fk_tables.front(); + dict_sys.load_table({f, strlen(f)}); + fk_tables.pop_front(); + } + + table->data_dir_path= NULL; + } + +funct_exit: + if (table) { + table->release(); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + trx->op_info = ""; + + return(err); +} diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc new file mode 100644 index 00000000..4756cc37 --- /dev/null +++ b/storage/innobase/row/row0purge.cc @@ -0,0 +1,1304 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0purge.cc +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" +#include "btr0cur.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "dict0crea.h" +#include "dict0stats.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "log0log.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "handler.h" +#include "ha_innodb.h" +#include "fil0fil.h" +#include "debug_sync.h" +#include <mysql/service_thd_mdl.h> + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Repositions the pcur in the purge node on the clustered index record, +if found. If the record is not found, close pcur. +@return TRUE if the record was found */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + btr_latch_mode mode, /*!< in: latching mode */ + purge_node_t* node, /*!< in: row purge node */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (node->found_clust) { + ut_ad(node->validate_pcur()); + + node->found_clust = + node->pcur.restore_position(mode, mtr) == + btr_pcur_t::SAME_ALL; + + } else { + node->found_clust = row_search_on_row_ref( + &node->pcur, mode, node->table, node->ref, mtr); + + if (node->found_clust) { + btr_pcur_store_position(&node->pcur, mtr); + } + } + + /* Close the current cursor if we fail to position it correctly. */ + if (!node->found_clust) { + btr_pcur_close(&node->pcur); + } + + return(node->found_clust); +} + +/***********************************************************//** +Removes a delete marked clustered index record if possible. 
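+If the record belongs to SYS_INDEXES, the index tree (and possibly the
+whole tablespace) that it refers to is freed as well.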
+@retval true if the row was not found, or it was successfully removed +@retval false if the row was modified after the delete marking */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + purge_node_t* node, /*!< in/out: row purge node */ + btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */ +{ + dict_index_t* index = dict_table_get_first_index(node->table); + table_id_t table_id = 0; + index_id_t index_id = 0; + dict_table_t *table = nullptr; + pfs_os_file_t f = OS_FILE_CLOSED; + + if (table_id) { +retry: + dict_sys.lock(SRW_LOCK_CALL); + table = dict_sys.find_table(table_id); + if (!table) { + dict_sys.unlock(); + } else if (table->n_rec_locks) { + for (dict_index_t* ind = UT_LIST_GET_FIRST( + table->indexes); ind; + ind = UT_LIST_GET_NEXT(indexes, ind)) { + if (ind->id == index_id) { + lock_discard_for_index(*ind); + } + } + } + } + mtr_t mtr; + mtr.start(); + index->set_modified(mtr); + log_free_check(); + bool success = true; + + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + /* The record was already removed. */ +removed: + mtr.commit(); +close_and_exit: + if (table) { + dict_sys.unlock(); + } + return success; + } + + if (node->table->id == DICT_INDEXES_ID) { + /* If this is a record of the SYS_INDEXES table, then + we have to free the file segments of the index tree + associated with the index */ + if (!table_id) { + const rec_t* rec = btr_pcur_get_rec(&node->pcur); + + table_id = mach_read_from_8(rec); + index_id = mach_read_from_8(rec + 8); + if (table_id) { + mtr.commit(); + goto retry; + } + ut_ad("corrupted SYS_INDEXES record" == 0); + } + + const uint32_t space_id = dict_drop_index_tree( + &node->pcur, nullptr, &mtr); + if (space_id) { + if (table) { + if (table->get_ref_count() == 0) { + dict_sys.remove(table); + } else if (table->space_id == space_id) { + table->space = nullptr; + table->file_unreadable = true; + } + dict_sys.unlock(); + table = nullptr; + } + f = fil_delete_tablespace(space_id); + } + + mtr.commit(); + + if (table) { + dict_sys.unlock(); + table = nullptr; + } + + if (space_id) { + ibuf_delete_for_discarded_space(space_id); + } + + mtr.start(); + index->set_modified(mtr); + + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + goto removed; + } + } + + rec_t* rec = btr_pcur_get_rec(&node->pcur); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { + /* Someone else has modified the record later: do not remove */ + goto func_exit; + } + + ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets)); + + if (mode == BTR_MODIFY_LEAF) { + success = DB_FAIL != btr_cur_optimistic_delete( + btr_pcur_get_btr_cur(&node->pcur), 0, &mtr); + } else { + dberr_t err; + ut_ad(mode == BTR_PURGE_TREE); + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0, + false, &mtr); + success = err == DB_SUCCESS; + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + + /* Persistent cursor is closed if reposition fails. 
*/ + if (node->found_clust) { + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + } else { + mtr_commit(&mtr); + } + + goto close_and_exit; +} + +/***********************************************************//** +Removes a clustered index record if it has not been modified after the delete +marking. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of running out +of file space. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) { + return(true); + } + + for (ulint n_tries = 0; + n_tries < BTR_CUR_RETRY_DELETE_N_TIMES; + n_tries++) { + if (row_purge_remove_clust_if_poss_low(node, BTR_PURGE_TREE)) { + return(true); + } + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + } + + return(false); +} + +/** Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@param[in,out] node row purge node +@param[in] index secondary index +@param[in] entry secondary index entry +@param[in,out] sec_pcur secondary index cursor or NULL + if it is called for purge buffering + operation. +@param[in,out] sec_mtr mini-transaction which holds + secondary index entry or NULL if it is + called for purge buffering operation. +@param[in] is_tree true=pessimistic purge, + false=optimistic (leaf-page only) +@return true if the secondary index record can be purged */ +bool +row_purge_poss_sec( + purge_node_t* node, + dict_index_t* index, + const dtuple_t* entry, + btr_pcur_t* sec_pcur, + mtr_t* sec_mtr, + bool is_tree) +{ + bool can_delete; + mtr_t mtr; + + ut_ad(!dict_index_is_clust(index)); + + mtr_start(&mtr); + + can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr) + || !row_vers_old_has_index_entry(true, + btr_pcur_get_rec(&node->pcur), + &mtr, index, entry, + node->roll_ptr, node->trx_id); + + /* Persistent cursor is closed if reposition fails. */ + if (node->found_clust) { + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + } else { + mtr.commit(); + } + + ut_ad(mtr.has_committed()); + + return can_delete; +} + +/*************************************************************** +Removes a secondary index entry if possible, by modifying the +index tree. Does not try to buffer the delete. 
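+This is the pessimistic variant; it is normally attempted only after
+row_purge_remove_sec_if_poss_leaf() has failed.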
+@return TRUE if success or if not found */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ibool +row_purge_remove_sec_if_poss_tree( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + ibool success = TRUE; + dberr_t err; + mtr_t mtr; + + log_free_check(); + mtr.start(); + index->set_modified(mtr); + pcur.btr_cur.page_cur.index = index; + + if (index->is_spatial()) { + if (!rtr_search(entry, BTR_PURGE_TREE, &pcur, &mtr)) { + goto found; + } + goto func_exit; + } + + switch (row_search_index_entry(entry, BTR_PURGE_TREE, &pcur, &mtr)) { + case ROW_NOT_FOUND: + /* Not found. This is a legitimate condition. In a + rollback, InnoDB will remove secondary recs that would + be purged anyway. Then the actual purge will not find + the secondary index record. Also, the purge itself is + eager: if it comes to consider a secondary index + record, and notices it does not need to exist in the + index, it will remove it. Then if/when the purge + comes to consider the secondary index record a second + time, it will not exist any more in the index. */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(stderr, entry); */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + +found: + if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) { + + /* Remove the index record, which should have been + marked for deletion. */ + if (!rec_get_deleted_flag(btr_cur_get_rec( + btr_pcur_get_btr_cur(&pcur)), + dict_table_is_comp(index->table))) { + ib::error() + << "tried to purge non-delete-marked record" + " in index " << index->name + << " of table " << index->table->name + << ": tuple: " << *entry + << ", record: " << rec_index_print( + btr_cur_get_rec( + btr_pcur_get_btr_cur(&pcur)), + index); + + ut_ad(0); + + goto func_exit; + } + + btr_cur_pessimistic_delete(&err, FALSE, + btr_pcur_get_btr_cur(&pcur), + 0, false, &mtr); + switch (UNIV_EXPECT(err, DB_SUCCESS)) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + success = FALSE; + break; + default: + ut_error; + } + } + +func_exit: + btr_pcur_close(&pcur); // FIXME: need this? + mtr.commit(); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry without modifying the index tree, +if possible. +@retval true if success or if not found +@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_remove_sec_if_poss_leaf( +/*==============================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + mtr_t mtr; + btr_pcur_t pcur; + bool success = true; + + log_free_check(); + ut_ad(index->table == node->table); + ut_ad(!index->table->is_temporary()); + mtr.start(); + index->set_modified(mtr); + + pcur.btr_cur.page_cur.index = index; + + /* Set the purge node for the call to row_purge_poss_sec(). 
*/ + pcur.btr_cur.purge_node = node; + if (index->is_spatial()) { + pcur.btr_cur.thr = NULL; + if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, &mtr)) { + goto found; + } + goto func_exit; + } + + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node)); + + switch (row_search_index_entry(entry, index->has_virtual() + ? BTR_MODIFY_LEAF : BTR_PURGE_LEAF, + &pcur, &mtr)) { + case ROW_FOUND: +found: + /* Before attempting to purge a record, check + if it is safe to do so. */ + if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* Only delete-marked records should be purged. */ + if (!rec_get_deleted_flag( + btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))) { + + ib::error() + << "tried to purge non-delete-marked" + " record" " in index " << index->name + << " of table " << index->table->name + << ": tuple: " << *entry + << ", record: " + << rec_index_print( + btr_cur_get_rec(btr_cur), + index); + mtr.commit(); + dict_set_corrupted(index, "purge"); + goto cleanup; + } + + if (index->is_spatial()) { + const buf_block_t* block = btr_cur_get_block( + btr_cur); + + if (block->page.id().page_no() + != index->page + && page_get_n_recs(block->page.frame) < 2 + && !lock_test_prdt_page_lock( + btr_cur->rtr_info + && btr_cur->rtr_info->thr + ? thr_get_trx( + btr_cur->rtr_info->thr) + : nullptr, + block->page.id())) { + /* this is the last record on page, + and it has a "page" lock on it, + which mean search is still depending + on it, so do not delete */ + DBUG_LOG("purge", + "skip purging last" + " record on page " + << block->page.id()); + goto func_exit; + } + } + + success = btr_cur_optimistic_delete(btr_cur, 0, &mtr) + != DB_FAIL; + } + + /* (The index entry is still needed, + or the deletion succeeded) */ + /* fall through */ + case ROW_NOT_DELETED_REF: + /* The index entry is still needed. */ + case ROW_BUFFERED: + /* The deletion was buffered. */ + case ROW_NOT_FOUND: + /* The index entry does not exist, nothing to do. */ +func_exit: + mtr.commit(); +cleanup: + btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? + return(success); + } + + ut_error; + return(false); +} + +/***********************************************************//** +Removes a secondary index entry if possible. */ +UNIV_INLINE MY_ATTRIBUTE((nonnull(1,2))) +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /*!< in: row purge node */ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry) /*!< in: index entry */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing secondary record\n", stderr); */ + + if (!entry) { + /* The node->row must have lacked some fields of this + index. This is possible when the undo log record was + written before this index was created. 
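+		Nothing needs to be purged from such an index.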
*/ + return; + } + + if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_tree(node, index, entry); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/***********************************************************//** +Purges a delete marking of a record. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended because of +running out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /*!< in/out: row purge node */ +{ + if (node->index) + { + mem_heap_t *heap= mem_heap_create(1024); + + do + { + if (node->index->type & (DICT_FTS | DICT_CORRUPT)) + continue; + if (!node->index->is_committed()) + continue; + dtuple_t* entry= row_build_index_entry_low(node->row, nullptr, + node->index, heap, + ROW_BUILD_FOR_PURGE); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); + } + while ((node->index= dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + } + + bool result= row_purge_remove_clust_if_poss(node); + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("enable_row_purge_del_mark_exit_sync_point", + debug_sync_set_action + (current_thd, + STRING_WITH_LEN("now SIGNAL row_purge_del_mark_finished")); + ); +#endif + + return result; +} + +/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record +whose old history can no longer be observed. +@param[in,out] node purge node +@param[in,out] mtr mini-transaction (will be started and committed) */ +static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr) +{ + /* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */ + mtr->start(); + + if (row_purge_reposition_pcur(BTR_MODIFY_LEAF, node, mtr)) { + dict_index_t* index = dict_table_get_first_index( + node->table); + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_t* rec = btr_pcur_get_rec(&node->pcur); + mem_heap_t* heap = NULL; + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. 
*/ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + rec_offs* offsets = rec_get_offsets( + rec, index, offsets_, index->n_core_fields, + trx_id_pos + 2, &heap); + ut_ad(heap == NULL); + + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos) + ->col->prtype == (DATA_TRX_ID | DATA_NOT_NULL)); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->mtype == DATA_SYS); + ut_ad(dict_index_get_nth_field(index, trx_id_pos + 1) + ->col->prtype == (DATA_ROLL_PTR | DATA_NOT_NULL)); + + /* Only update the record if DB_ROLL_PTR matches (the + record has not been modified after this transaction + became purgeable) */ + if (node->roll_ptr + == row_get_rec_roll_ptr(rec, index, offsets)) { + ut_ad(!rec_get_deleted_flag( + rec, rec_offs_comp(offsets)) + || rec_is_alter_metadata(rec, *index)); + DBUG_LOG("purge", "reset DB_TRX_ID=" + << ib::hex(row_get_rec_trx_id( + rec, index, offsets))); + + index->set_modified(*mtr); + buf_block_t* block = btr_pcur_get_block(&node->pcur); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + mtr); + } else { + ulint len; + byte* ptr = rec_get_nth_field( + rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + size_t offs = page_offset(ptr); + mtr->memset(block, offs, DATA_TRX_ID_LEN, 0); + offs += DATA_TRX_ID_LEN; + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, block->page.frame + offs, + 0x80U); + mtr->memset(block, offs + 1, + DATA_ROLL_PTR_LEN - 1, 0); + } + } + } + + mtr->commit(); +} + +/***********************************************************//** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. */ +static +void +row_purge_upd_exist_or_extern_func( +/*===============================*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + purge_node_t* node, /*!< in: row purge node */ + const trx_undo_rec_t* undo_rec) /*!< in: record to purge */ +{ + mem_heap_t* heap; + + ut_ad(!node->table->skip_alter_undo); + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + || !node->index) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + do { + if (node->index->type & (DICT_FTS | DICT_CORRUPT)) { + continue; + } + + if (!node->index->is_committed()) { + continue; + } + + if (row_upd_changes_ord_field_binary(node->index, node->update, + thr, NULL, NULL)) { + /* Build the older version of the index entry */ + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, + heap, ROW_BUILD_FOR_PURGE); + row_purge_remove_sec_if_poss(node, node->index, entry); + + ut_ad(node->table); + + mem_heap_empty(heap); + } + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + +skip_secondaries: + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(node->table); + /* Free possible externally stored fields */ + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + bool is_insert; + ulint rseg_id; + uint32_t page_no; + uint16_t offset; + + /* We use the fact that new_val points to + undo_rec and get thus the offset of + dfield data inside the undo record. 
Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + const uint16_t internal_offset = uint16_t( + static_cast<const byte*> + (dfield_get_data(&ufield->new_val)) + - undo_rec); + + ut_a(internal_offset < srv_page_size); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + + const trx_rseg_t &rseg = trx_sys.rseg_array[rseg_id]; + ut_ad(rseg.is_persistent()); + + mtr.start(); + + /* We have to acquire an SX-latch to the clustered + index tree (exclude other tree changes) */ + + mtr_sx_lock_index(index, &mtr); + + index->set_modified(mtr); + + /* NOTE: we must also acquire a U latch to the + root page of the tree. We will need it when we + free pages from the tree. If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + dberr_t err; + if (!btr_root_block_get(index, RW_SX_LATCH, &mtr, + &err)) { + } else if (buf_block_t* block = + buf_page_get(page_id_t(rseg.space->id, + page_no), + 0, RW_X_LATCH, &mtr)) { + block->page.set_accessed(); + buf_page_make_young_if_needed(&block->page); + + byte* data_field = block->page.frame + + offset + internal_offset; + + ut_a(dfield_get_len(&ufield->new_val) + >= BTR_EXTERN_FIELD_REF_SIZE); + btr_free_externally_stored_field( + index, + data_field + + dfield_get_len(&ufield->new_val) + - BTR_EXTERN_FIELD_REF_SIZE, + NULL, NULL, block, 0, false, &mtr); + } + + mtr.commit(); + } + } + + row_purge_reset_trx_id(node, &mtr); +} + +#ifdef UNIV_DEBUG +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(thr,node,undo_rec) +#else /* UNIV_DEBUG */ +# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \ + row_purge_upd_exist_or_extern_func(node,undo_rec) +#endif /* UNIV_DEBUG */ + +/** Build a partial row from an update undo log record for purge. +Any columns which occur as ordering in any index of the table are present. +Any missing columns are indicated by col->mtype == DATA_MISSING. + +@param ptr remaining part of the undo log record +@param index clustered index +@param node purge node +@return pointer to remaining part of undo record */ +static byte *row_purge_get_partial(const byte *ptr, const dict_index_t &index, + purge_node_t *node) +{ + bool first_v_col= true; + bool is_undo_log= true; + + ut_ad(index.is_primary()); + ut_ad(index.n_uniq == node->ref->n_fields); + + node->row= dtuple_create_with_vcol(node->heap, index.table->n_cols, + index.table->n_v_cols); + + /* Mark all columns in the row uninitialized, so that + we can distinguish missing fields from fields that are SQL NULL. 
*/ + for (ulint i= 0; i < index.table->n_cols; i++) + node->row->fields[i].type.mtype= DATA_MISSING; + + dtuple_init_v_fld(node->row); + + for (const upd_field_t *uf= node->update->fields, *const ue= + node->update->fields + node->update->n_fields; uf != ue; uf++) + { + if (!uf->old_v_val) + { + const dict_col_t &c= *dict_index_get_nth_col(&index, uf->field_no); + if (!c.is_dropped()) + node->row->fields[c.ind]= uf->new_val; + } + } + + const byte *end_ptr= ptr + mach_read_from_2(ptr); + ptr+= 2; + + while (ptr != end_ptr) + { + dfield_t *dfield; + const byte *field; + const dict_col_t *col; + uint32_t len, orig_len, field_no= mach_read_next_compressed(&ptr); + + if (field_no >= REC_MAX_N_FIELDS) + { + ptr= trx_undo_read_v_idx(index.table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col= false; + + ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + if (field_no == FIL_NULL) + continue; /* there no longer is an index on the virtual column */ + + dict_v_col_t *vcol= dict_table_get_nth_v_col(index.table, field_no); + col =&vcol->m_col; + dfield= dtuple_get_nth_v_field(node->row, vcol->v_pos); + dict_col_copy_type(&vcol->m_col, &dfield->type); + } + else + { + ptr= trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + col= dict_index_get_nth_col(&index, field_no); + if (col->is_dropped()) + continue; + dfield= dtuple_get_nth_field(node->row, col->ind); + ut_ad(dfield->type.mtype == DATA_MISSING || + dict_col_type_assert_equal(col, &dfield->type)); + ut_ad(dfield->type.mtype == DATA_MISSING || + dfield->len == len || + (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD)); + dict_col_copy_type(dict_table_get_nth_col(index.table, col->ind), + &dfield->type); + } + + dfield_set_data(dfield, field, len); + + if (len == UNIV_SQL_NULL || len < UNIV_EXTERN_STORAGE_FIELD) + continue; + + spatial_status_t spatial_status= static_cast<spatial_status_t> + ((len & SPATIAL_STATUS_MASK) >> SPATIAL_STATUS_SHIFT); + len&= ~SPATIAL_STATUS_MASK; + + /* Keep compatible with 5.7.9 format. */ + if (spatial_status == SPATIAL_UNKNOWN) + spatial_status= dict_col_get_spatial_status(col); + + switch (UNIV_EXPECT(spatial_status, SPATIAL_NONE)) { + case SPATIAL_ONLY: + ut_ad(len - UNIV_EXTERN_STORAGE_FIELD == DATA_MBR_LEN); + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD); + break; + + case SPATIAL_MIXED: + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD - DATA_MBR_LEN); + break; + + default: + dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD); + break; + } + + dfield_set_ext(dfield); + dfield_set_spatial_status(dfield, spatial_status); + + if (!col->ord_part || spatial_status == SPATIAL_ONLY || + node->rec_type == TRX_UNDO_UPD_DEL_REC) + continue; + /* If the prefix of this BLOB column is indexed, ensure that enough + prefix is stored in the undo log record. */ + ut_a(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE); + ut_a(dict_table_has_atomic_blobs(index.table) || + dfield_get_len(dfield) >= + REC_ANTELOPE_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE); + } + + for (ulint i= 0; i < index.n_uniq; i++) + { + dfield_t &field= node->row->fields[index.fields[i].col->ind]; + if (field.type.mtype == DATA_MISSING) + field= node->ref->fields[i]; + } + + return const_cast<byte*>(ptr); +} + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/** Parses the row reference and other info in a modify undo log record. 
+@param[in] node row undo node +@param[in] undo_rec record to purge +@param[in] thr query thread +@param[out] updated_extern true if an externally stored field was + updated +@return true if purge operation required */ +static +bool +row_purge_parse_undo_rec( + purge_node_t* node, + const trx_undo_rec_t* undo_rec, + que_thr_t* thr, + bool* updated_extern) +{ + dict_index_t* clust_index; + undo_no_t undo_no; + table_id_t table_id; + roll_ptr_t roll_ptr; + byte info_bits; + byte type; + + const byte* ptr = trx_undo_rec_get_pars( + undo_rec, &type, &node->cmpl_info, + updated_extern, &undo_no, &table_id); + + node->rec_type = type; + + switch (type) { + case TRX_UNDO_RENAME_TABLE: + return false; + case TRX_UNDO_EMPTY: + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + /* These records do not store any transaction identifier. */ + node->trx_id = TRX_ID_MAX; + break; + default: +#ifdef UNIV_DEBUG + ut_ad("unknown undo log record type" == 0); + return false; + case TRX_UNDO_UPD_DEL_REC: + case TRX_UNDO_UPD_EXIST_REC: + case TRX_UNDO_DEL_MARK_REC: +#endif /* UNIV_DEBUG */ + ptr = trx_undo_update_rec_get_sys_cols(ptr, &node->trx_id, + &roll_ptr, &info_bits); + break; + } + + auto &tables_entry= node->tables[table_id]; + node->table = tables_entry.first; + if (!node->table) { + return false; + } + +#ifndef DBUG_OFF + if (MDL_ticket* mdl = tables_entry.second) { + static_cast<MDL_context*>(thd_mdl_context(current_thd)) + ->lock_warrant = mdl->get_ctx(); + } +#endif + ut_ad(!node->table->is_temporary()); + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index->is_corrupted()) { + /* The table was corrupt in the data dictionary. + dict_set_corrupted() works on an index, and + we do not have an index to call it with. */ + DBUG_ASSERT(table_id == node->table->id); + return false; + } + + switch (type) { + case TRX_UNDO_INSERT_METADATA: + node->ref = &trx_undo_metadata; + return true; + case TRX_UNDO_EMPTY: + node->ref = nullptr; + return true; + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + if (type == TRX_UNDO_INSERT_REC) { + return(true); + } + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, + node->trx_id, + roll_ptr, info_bits, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ut_ad(!(node->update->info_bits & REC_INFO_MIN_REC_FLAG)); + ptr = row_purge_get_partial(ptr, *clust_index, node); + } else if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + node->ref = &trx_undo_metadata; + } + + return(true); +} + +/** Purges the parsed record. 
+@param[in] node row purge node +@param[in] undo_rec record to purge +@param[in] thr query thread +@param[in] updated_extern whether external columns were updated +@return true if purged, false if skipped */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +row_purge_record_func( + purge_node_t* node, + const trx_undo_rec_t* undo_rec, +#if defined UNIV_DEBUG || defined WITH_WSREP + const que_thr_t*thr, +#endif /* UNIV_DEBUG || WITH_WSREP */ + bool updated_extern) +{ + ut_ad(!node->found_clust); + ut_ad(!node->table->skip_alter_undo); + ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + bool purged = true; + + switch (node->rec_type) { + case TRX_UNDO_EMPTY: + break; + case TRX_UNDO_DEL_MARK_REC: + purged = row_purge_del_mark(node); + if (purged) { + if (node->table->stat_initialized + && srv_stats_include_delete_marked) { + dict_stats_update_if_needed( + node->table, *thr->graph->trx); + } + MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); + } + break; + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + /* fall through */ + default: + if (!updated_extern) { + mtr_t mtr; + row_purge_reset_trx_id(node, &mtr); + break; + } + /* fall through */ + case TRX_UNDO_UPD_EXIST_REC: + row_purge_upd_exist_or_extern(thr, node, undo_rec); + MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN); + break; + } + + if (node->found_clust) { + node->found_clust = false; + btr_pcur_close(&node->pcur); + } + + return(purged); +} + +#if defined UNIV_DEBUG || defined WITH_WSREP +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,thr,updated_extern) +#else /* UNIV_DEBUG || WITH_WSREP */ +# define row_purge_record(node,undo_rec,thr,updated_extern) \ + row_purge_record_func(node,undo_rec,updated_extern) +#endif /* UNIV_DEBUG || WITH_WSREP */ + +/***********************************************************//** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static MY_ATTRIBUTE((nonnull)) +void +row_purge( +/*======*/ + purge_node_t* node, /*!< in: row purge node */ + const trx_undo_rec_t* undo_rec, /*!< in: record to purge */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (undo_rec != reinterpret_cast<trx_undo_rec_t*>(-1)) { + bool updated_extern; + + while (row_purge_parse_undo_rec( + node, undo_rec, thr, &updated_extern)) { + + bool purged = row_purge_record( + node, undo_rec, thr, updated_extern); + + if (purged + || srv_shutdown_state > SRV_SHUTDOWN_INITIATED) { + return; + } + + /* Retry the purge in a second. 
*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } +} + +inline void purge_node_t::start() +{ + ut_ad(in_progress); + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + + row= nullptr; + ref= nullptr; + index= nullptr; + update= nullptr; + found_clust= false; + rec_type= 0; + cmpl_info= 0; +} + +/** Reset the state at end +@return the query graph parent */ +inline que_node_t *purge_node_t::end(THD *thd) +{ + DBUG_ASSERT(common.type == QUE_NODE_PURGE); + ut_ad(undo_recs.empty()); + ut_d(in_progress= false); + innobase_reset_background_thd(thd); +#ifndef DBUG_OFF + static_cast<MDL_context*>(thd_mdl_context(thd))->lock_warrant= nullptr; +#endif + mem_heap_empty(heap); + return common.parent; +} + + +/***********************************************************//** +Does the purge operation. +@return query thread to run next */ +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + purge_node_t* node; + + node = static_cast<purge_node_t*>(thr->run_node); + + node->start(); + + while (!node->undo_recs.empty()) { + trx_purge_rec_t purge_rec = node->undo_recs.front(); + node->undo_recs.pop(); + node->roll_ptr = purge_rec.roll_ptr; + + row_purge(node, purge_rec.undo_rec, thr); + } + + thr->run_node = node->end(current_thd); + return(thr); +} + +#ifdef UNIV_DEBUG +/***********************************************************//** +Validate the persisent cursor. The purge node has two references +to the clustered index record - one via the ref member, and the +other via the persistent cursor. These two references must match +each other if the found_clust flag is set. +@return true if the stored copy of persistent cursor is consistent +with the ref member.*/ +bool +purge_node_t::validate_pcur() +{ + if (!found_clust) { + return(true); + } + + if (index == NULL) { + return(true); + } + + if (index->type == DICT_FTS) { + return(true); + } + + if (!pcur.old_rec) { + return(true); + } + + dict_index_t* clust_index = pcur.index(); + + rec_offs* offsets = rec_get_offsets( + pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields, + pcur.old_n_fields, &heap); + + /* Here we are comparing the purge ref record and the stored initial + part in persistent cursor. Both cases we store n_uniq fields of the + cluster index and so it is fine to do the comparison. We note this + dependency here as pcur and ref belong to different modules. */ + int st = cmp_dtuple_rec(ref, pcur.old_rec, clust_index, offsets); + + if (st != 0) { + ib::error() << "Purge node pcur validation failed"; + ib::error() << rec_printer(ref).str(); + ib::error() << rec_printer(pcur.old_rec, offsets).str(); + return(false); + } + + return(true); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc new file mode 100644 index 00000000..e927096f --- /dev/null +++ b/storage/innobase/row/row0quiesce.cc @@ -0,0 +1,715 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0quiesce.cc +Quiesce a tablespace. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0quiesce.h" +#include "row0mysql.h" +#include "buf0flu.h" +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "trx0purge.h" + +#ifdef HAVE_MY_AES_H +#include <my_aes.h> +#endif + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_index_fields( +/*===========================*/ + const dict_index_t* index, /*!< in: write the meta data for + this index */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte row[sizeof(ib_uint32_t) * 2]; + + for (ulint i = 0; i < index->n_fields; ++i) { + byte* ptr = row; + const dict_field_t* field = &index->fields[i]; + + mach_write_to_4(ptr, field->prefix_len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, field->fixed_len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_9", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index fields."); + + return(DB_IO_ERROR); + } + + const char* field_name = field->name ? field->name : ""; + /* Include the NUL byte in the length. */ + ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field_name) + 1); + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_10", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(field_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index column."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file index information. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_indexes( +/*======================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + ulint n_indexes = 0; + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index; index = UT_LIST_GET_NEXT(indexes, index)) { + n_indexes += index->is_committed(); + } + + { + byte row[sizeof(ib_uint32_t)]; + + /* Write the number of indexes in the table. */ + mach_write_to_4(row, n_indexes); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_11", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing index count."); + + return(DB_IO_ERROR); + } + } + + dberr_t err = DB_SUCCESS; + + /* Write the index meta data. 
*/
+	for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+	     index != 0 && err == DB_SUCCESS;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+
+		if (!index->is_committed()) {
+			continue;
+		}
+
+		ut_ad(n_indexes); ut_d(n_indexes--);
+
+		byte*	ptr;
+		byte	row[sizeof(index_id_t)
+			    + sizeof(ib_uint32_t) * 8];
+
+		ptr = row;
+
+		ut_ad(sizeof(index_id_t) == 8);
+		mach_write_to_8(ptr, index->id);
+		ptr += sizeof(index_id_t);
+
+		mach_write_to_4(ptr, table->space_id);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->page);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->type);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->trx_id_offset);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_user_defined_cols);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_uniq);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_nullable);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, index->n_fields);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno),
+				"while writing index meta-data.");
+
+			return(DB_IO_ERROR);
+		}
+
+		/* Write the length of the index name.
+		NUL byte is included in the length. */
+		ib_uint32_t	len = static_cast<ib_uint32_t>(strlen(index->name) + 1);
+		ut_a(len > 1);
+
+		mach_write_to_4(row, len);
+
+		DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+				close(fileno(file)););
+
+		if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+		    || fwrite(index->name, 1, len, file) != len) {
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno),
+				"while writing index name.");
+
+			return(DB_IO_ERROR);
+		}
+
+		err = row_quiesce_write_index_fields(index, file, thd);
+	}
+
+	ut_ad(!n_indexes);
+	return(err);
+}
+
+/*********************************************************************//**
+Write the meta data (table columns) config file. Serialise the contents of
+dict_col_t structure, along with the column name. All fields are serialized
+as ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: write the meta data for
+					this table */
+	FILE*			file,	/*!< in: file to write to */
+	THD*			thd)	/*!< in/out: session */
+{
+	dict_col_t*	col;
+	byte		row[sizeof(ib_uint32_t) * 7];
+
+	col = table->cols;
+
+	for (ulint i = 0; i < table->n_cols; ++i, ++col) {
+		byte*	ptr = row;
+
+		mach_write_to_4(ptr, col->prtype);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, col->mtype);
+		ptr += sizeof(ib_uint32_t);
+
+		mach_write_to_4(ptr, col->len);
+		ptr += sizeof(ib_uint32_t);
+
+		/* FIXME: This will not work if mbminlen>4.
+		This field is also redundant, because the lengths
+		are a property of the character set encoding, which
+		in turn is encoded in prtype above.
*/ + mach_write_to_4(ptr, ulint(col->mbmaxlen * 5 + col->mbminlen)); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ind); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ord_part); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->max_prefix); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_2", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing table column data."); + + return(DB_IO_ERROR); + } + + /* Write out the column name as [len, byte array]. The len + includes the NUL byte. */ + ib_uint32_t len; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + /* Include the NUL byte in the length. */ + len = static_cast<ib_uint32_t>(strlen(col_name) + 1); + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_3", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(col_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing column name."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file header. +@return DB_SUCCESS or error code. */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_header( +/*=====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Write the meta-data version number. */ + mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing meta-data version number."); + + return(DB_IO_ERROR); + } + + /* Write the server hostname. */ + ib_uint32_t len; + const char* hostname = server_get_hostname(); + + /* Play it safe and check for NULL. */ + if (hostname == 0) { + static const char NullHostname[] = "Hostname unknown"; + + ib::warn() << "Unable to determine server hostname."; + + hostname = NullHostname; + } + + /* The server hostname includes the NUL byte. */ + len = static_cast<ib_uint32_t>(strlen(hostname) + 1); + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(hostname, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + (ulong) errno, strerror(errno), + "while writing hostname."); + + return(DB_IO_ERROR); + } + + /* The table name includes the NUL byte. */ + ut_a(table->name.m_name != NULL); + len = static_cast<ib_uint32_t>(strlen(table->name.m_name) + 1); + + /* Write the table name. 
*/
+	mach_write_to_4(value, len);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+	if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+	    || fwrite(table->name.m_name, 1, len, file) != len) {
+
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table name.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte		row[sizeof(ib_uint32_t) * 3];
+
+	/* Write the next autoinc value. */
+	mach_write_to_8(row, table->autoinc);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+	if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table autoinc value.");
+
+		return(DB_IO_ERROR);
+	}
+
+	byte*	ptr = row;
+
+	/* Write the system page size. */
+	mach_write_to_4(ptr, srv_page_size);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the table->flags. */
+	mach_write_to_4(ptr, table->flags);
+	ptr += sizeof(ib_uint32_t);
+
+	/* Write the number of columns in the table. */
+	mach_write_to_4(ptr, table->n_cols);
+
+	DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+	if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+		ib_senderrf(
+			thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+			(ulong) errno, strerror(errno),
+			"while writing table meta-data.");
+
+		return(DB_IO_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data after quiesce.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+	dict_table_t*	table,	/*!< in: write the meta data for
+				this table */
+	THD*		thd)	/*!< in/out: session */
+{
+	dberr_t	err;
+	char	name[OS_FILE_MAX_PATH];
+
+	srv_get_meta_data_filename(table, name, sizeof(name));
+
+	ib::info() << "Writing table metadata to '" << name << "'";
+
+	FILE*	file = fopen(name, "w+b");
+
+	if (file == NULL) {
+		ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+			name, errno, strerror(errno));
+
+		err = DB_IO_ERROR;
+	} else {
+		err = row_quiesce_write_header(table, file, thd);
+
+		if (err == DB_SUCCESS) {
+			err = row_quiesce_write_table(table, file, thd);
+		}
+
+		if (err == DB_SUCCESS) {
+			err = row_quiesce_write_indexes(table, file, thd);
+		}
+
+		if (fflush(file) != 0) {
+
+			char	msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s flush() failed", name);
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno), msg);
+		}
+
+		if (fclose(file) != 0) {
+			char	msg[BUFSIZ];
+
+			snprintf(msg, sizeof(msg), "%s fclose() failed", name);
+
+			ib_senderrf(
+				thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+				(ulong) errno, strerror(errno), msg);
+		}
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Check whether a table has an FTS index defined on it.
+@return true if an FTS index exists on the table */ +static +bool +row_quiesce_table_has_fts_index( +/*============================*/ + const dict_table_t* table) /*!< in: quiesce this table */ +{ + bool exists = false; + + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + exists = true; + break; + } + } + + return(exists); +} + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ut_a(trx->mysql_thd != 0); + ut_a(srv_n_purge_threads > 0); + ut_ad(!srv_read_only_mode); + + ut_a(trx->mysql_thd != 0); + + ut_ad(table->space != NULL); + ib::info() << "Sync to disk of " << table->name << " started."; + + if (srv_undo_sources) { + purge_sys.stop(); + } + + for (ulint count = 0; + ibuf_merge_space(table->space_id); + ++count) { + if (trx_is_interrupted(trx)) { + goto aborted; + } + if (!(count % 20)) { + ib::info() << "Merging change buffer entries for " + << table->name; + } + } + + while (buf_flush_list_space(table->space)) { + if (trx_is_interrupted(trx)) { + goto aborted; + } + } + + if (!trx_is_interrupted(trx)) { + /* Ensure that all asynchronous IO is completed. */ + os_aio_wait_until_no_pending_writes(true); + table->space->flush<false>(); + + if (row_quiesce_write_cfg(table, trx->mysql_thd) + != DB_SUCCESS) { + ib::warn() << "There was an error writing to the" + " meta data file"; + } else { + ib::info() << "Table " << table->name + << " flushed to disk"; + } + } else { +aborted: + ib::warn() << "Quiesce aborted!"; + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Cleanup after table quiesce. */ +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ulint count = 0; + + ut_a(trx->mysql_thd != 0); + + /* We need to wait for the operation to complete if the + transaction has been killed. */ + + while (table->quiesce != QUIESCE_COMPLETE) { + + /* Print a warning after every minute. */ + if (!(count % 60)) { + ib::warn() << "Waiting for quiesce of " << table->name + << " to complete"; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + ++count; + } + + if (!opt_bootstrap) { + /* Remove the .cfg file now that the user has resumed + normal operations. Otherwise it will cause problems when + the user tries to drop the database (remove directory). */ + char cfg_name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name)); + + os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL); + + ib::info() << "Deleting the meta-data file '" << cfg_name << "'"; + } + + if (srv_undo_sources) { + purge_sys.resume(); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. 
*/ +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(srv_n_purge_threads > 0); + + if (srv_read_only_mode) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + return(DB_UNSUPPORTED); + + } else if (table->is_temporary()) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_CANNOT_DISCARD_TEMPORARY_TABLE); + + return(DB_UNSUPPORTED); + } else if (table->space_id == TRX_SYS_SPACE) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + table->name.m_name); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + return(DB_UNSUPPORTED); + } else if (row_quiesce_table_has_fts_index(table)) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on tables that have an FTS index." + " FTS auxiliary tables will not be flushed."); + + } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + /* If this flag is set then the table may not have any active + FTS indexes but it will still have the auxiliary tables. */ + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on a table that had an FTS index," + " created on a hidden column, the" + " auxiliary tables haven't been dropped as yet." + " FTS auxiliary tables will not be flushed."); + } + + dict_index_t* clust_index = dict_table_get_first_index(table); + + for (dict_index_t* index = dict_table_get_next_index(clust_index); + index != NULL; + index = dict_table_get_next_index(index)) { + index->lock.x_lock(SRW_LOCK_CALL); + } + + clust_index->lock.x_lock(SRW_LOCK_CALL); + + switch (state) { + case QUIESCE_START: + break; + + case QUIESCE_COMPLETE: + ut_a(table->quiesce == QUIESCE_START); + break; + + case QUIESCE_NONE: + ut_a(table->quiesce == QUIESCE_COMPLETE); + break; + } + + table->quiesce = state; + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->lock.x_unlock(); + } + + return(DB_SUCCESS); +} + diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc new file mode 100644 index 00000000..4a00b2a4 --- /dev/null +++ b/storage/innobase/row/row0row.cc @@ -0,0 +1,1720 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0row.cc +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" +#include "data0type.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "ut0mem.h" +#include "gis0geo.h" +#include "row0mysql.h" + +/** Build a spatial index key. +@param[in] index spatial index +@param[in] ext externally stored column prefixes, or NULL +@param[in,out] dfield field of the tuple to be copied +@param[in] dfield2 field of the tuple to copy +@param[in] flag ROW_BUILD_NORMAL, ROW_BUILD_FOR_PURGE or + ROW_BUILD_FOR_UNDO +@param[in,out] heap memory heap from which the memory + of the field entry is allocated. +@retval false if undo log is logged before spatial index creation. */ +static bool row_build_spatial_index_key( + const dict_index_t* index, + const row_ext_t* ext, + dfield_t* dfield, + const dfield_t* dfield2, + ulint flag, + mem_heap_t* heap) +{ + if (dfield2->type.mtype == DATA_MISSING) { + return false; + } + + double* mbr; + + dfield_copy(dfield, dfield2); + dfield->type.prtype |= DATA_GIS_MBR; + + /* Allocate memory for mbr field */ + mbr = static_cast<double*>(mem_heap_alloc(heap, DATA_MBR_LEN)); + + /* Set mbr field data. */ + dfield_set_data(dfield, mbr, DATA_MBR_LEN); + + const fil_space_t* space = index->table->space; + + if (UNIV_UNLIKELY(!dfield2->data || !space)) { + /* FIXME: dfield contains uninitialized data, + but row_build_index_entry_low() will not return NULL. + This bug is inherited from MySQL 5.7.5 + commit b66ad511b61fffe75c58d0a607cdb837c6e6c821. */ + return true; + } + + const byte* dptr = NULL; + ulint dlen = 0; + ulint flen = 0; + double tmp_mbr[SPDIMS * 2]; + mem_heap_t* temp_heap = NULL; + + if (!dfield_is_ext(dfield2)) { + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + dlen = dfield_get_len(dfield2); + ut_ad(dptr != &data_error); + goto write_mbr; + } + + if (flag == ROW_BUILD_FOR_PURGE) { + const byte* ptr = static_cast<const byte*>( + dfield_get_data(dfield2)); + + switch (dfield_get_spatial_status(dfield2)) { + case SPATIAL_ONLY: + ut_ad(dfield_get_len(dfield2) == DATA_MBR_LEN); + break; + + case SPATIAL_MIXED: + ptr += dfield_get_len(dfield2); + break; + + case SPATIAL_UNKNOWN: + ut_ad(0); + /* fall through */ + case SPATIAL_NONE: + /* Undo record is logged before + spatial index is created.*/ + return false; + } + + memcpy(mbr, ptr, DATA_MBR_LEN); + return true; + } + + if (flag == ROW_BUILD_FOR_UNDO + && dict_table_has_atomic_blobs(index->table)) { + /* For ROW_FORMAT=DYNAMIC or COMPRESSED, a prefix of + off-page records is stored in the undo log record (for + any column prefix indexes). For SPATIAL INDEX, we + must ignore this prefix. The full column value is + stored in the BLOB. For non-spatial index, we would + have already fetched a necessary prefix of the BLOB, + available in the "ext" parameter. 
+ + Here, for SPATIAL INDEX, we are fetching the full + column, which is potentially wasting a lot of I/O, + memory, and possibly involving a concurrency problem, + similar to ones that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR directly to the undo + log record, and avoid recomputing it here! */ + flen = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(dfield_get_len(dfield2) >= BTR_EXTERN_FIELD_REF_SIZE); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)) + + dfield_get_len(dfield2) + - BTR_EXTERN_FIELD_REF_SIZE; + } else { + flen = dfield_get_len(dfield2); + dptr = static_cast<const byte*>(dfield_get_data(dfield2)); + } + + temp_heap = mem_heap_create(1000); + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, ext ? ext->zip_size : space->zip_size(), + flen, temp_heap); + +write_mbr: + if (dlen <= GEO_DATA_HEADER_SIZE) { + for (uint i = 0; i < SPDIMS; i += 2) { + tmp_mbr[i] = DBL_MAX; + tmp_mbr[i + 1] = -DBL_MAX; + } + } else { + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + uint(dlen - GEO_DATA_HEADER_SIZE), + SPDIMS, tmp_mbr); + } + + dfield_write_mbr(dfield, tmp_mbr); + if (temp_heap) { + mem_heap_free(temp_heap); + } + + return true; +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE + or ROW_BUILD_FOR_UNDO */ +{ + dtuple_t* entry; + ulint entry_len; + ulint i = 0; + ulint num_v = 0; + + entry_len = dict_index_get_n_fields(index); + + if (flag == ROW_BUILD_FOR_INSERT && dict_index_is_clust(index)) { + num_v = dict_table_get_n_v_cols(index->table); + entry = dtuple_create_with_vcol(heap, entry_len, num_v); + } else { + entry = dtuple_create(heap, entry_len); + } + + if (dict_index_is_ibuf(index)) { + dtuple_set_n_fields_cmp(entry, entry_len); + /* There may only be externally stored columns + in a clustered index B-tree of a user table. 
*/ + ut_a(!ext); + } else { + dtuple_set_n_fields_cmp( + entry, dict_index_get_n_unique_in_tree(index)); + if (dict_index_is_spatial(index)) { + /* Set the MBR field */ + if (!row_build_spatial_index_key( + index, ext, + dtuple_get_nth_field(entry, 0), + dtuple_get_nth_field( + row, + dict_index_get_nth_field(index, i) + ->col->ind), flag, heap)) { + return NULL; + } + + i = 1; + } + } + + for (; i < entry_len; i++) { + const dict_field_t& f = index->fields[i]; + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (f.col->is_dropped()) { + ut_ad(index->is_primary()); + ut_ad(index->is_instant()); + ut_ad(!f.col->is_virtual()); + dict_col_copy_type(f.col, &dfield->type); + if (f.col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + f.fixed_len); + } + continue; + } + + const dfield_t* dfield2; + + if (f.col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(f.col); + + ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); + dfield2 = dtuple_get_nth_v_field(row, v_col->v_pos); + + ut_ad(dfield_is_null(dfield2) || + dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(!dfield_is_ext(dfield2)); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + ut_ad(flag == ROW_BUILD_FOR_PURGE); + return(NULL); + } + } else { + dfield2 = dtuple_get_nth_field(row, f.col->ind); + if (UNIV_UNLIKELY(dfield2->type.mtype + == DATA_MISSING)) { + /* The field has not been initialized in + the row. This should be from + trx_undo_rec_get_partial_row(). */ + return(NULL); + } + + ut_ad(!(dfield2->type.prtype & DATA_VIRTUAL)); + } + + compile_time_assert(DATA_MISSING == 0); + + *dfield = *dfield2; + + if (dfield_is_null(dfield)) { + continue; + } + + ut_ad(!(index->type & DICT_FTS)); + + ulint len = dfield_get_len(dfield); + + if (f.prefix_len == 0 + && (!dfield_is_ext(dfield) + || dict_index_is_clust(index))) { + /* The *dfield = *dfield2 above suffices for + columns that are stored in-page, or for + clustered index record columns that are not + part of a column prefix in the PRIMARY KEY. */ + continue; + } + + /* If the column is stored externally (off-page) in + the clustered index, it must be an ordering field in + the secondary index. If !atomic_blobs, the only way + we may have a secondary index pointing to a clustered + index record with an off-page column is when it is a + column prefix index. If atomic_blobs, also fully + indexed long columns may be stored off-page. */ + ut_ad(f.col->ord_part); + + if (ext && !f.col->is_virtual()) { + /* See if the column is stored externally. */ + const byte* buf = row_ext_lookup(ext, f.col->ind, + &len); + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + return(NULL); + } + dfield_set_data(dfield, buf, len); + } + + if (f.prefix_len == 0) { + /* If ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED, we can have a + secondary index on an entire column + that is stored off-page in the + clustered index. As this is not a + prefix index (prefix_len == 0), + include the entire off-page column in + the secondary index record. */ + continue; + } + } else if (dfield_is_ext(dfield)) { + /* This table is either in + (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT) + or a purge record where the ordered part of + the field is not external. + In ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + the maximum column prefix + index length is 767 bytes, and the clustered + index record contains a 768-byte prefix of + each off-page column. 
*/ + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + len -= BTR_EXTERN_FIELD_REF_SIZE; + dfield_set_len(dfield, len); + } + + /* If a column prefix index, take only the prefix. */ + if (f.prefix_len) { + len = dtype_get_at_most_n_mbchars( + f.col->prtype, + f.col->mbminlen, f.col->mbmaxlen, + f.prefix_len, len, + static_cast<char*>(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + } + + for (i = num_v; i--; ) { + ut_ad(index->is_primary()); + ut_ad(flag == ROW_BUILD_FOR_INSERT); + dfield_t* dfield = dtuple_get_nth_v_field(entry, i); + const dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + ut_ad(!v_col->m_col.is_dropped()); + ut_ad(v_col->v_pos < dtuple_get_n_v_fields(row)); + const dfield_t* dfield2 = dtuple_get_nth_v_field( + row, v_col->v_pos); + ut_ad(dfield_is_null(dfield2) || + dfield_get_len(dfield2) == 0 || dfield2->data); + ut_ad(dfield2->type.mtype != DATA_MISSING); + *dfield = *dfield2; + } + + return entry; +} + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. +@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added/changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built; */ +static inline +dtuple_t* +row_build_low( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap) +{ + const byte* copy; + dtuple_t* row; + ulint n_ext_cols; + ulint* ext_cols = NULL; /* remove warning */ + ulint len; + byte* buf; + ulint j; + mem_heap_t* tmp_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(index != NULL); + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(!col_map || col_table); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* Some blob refs can be NULL during crash recovery before + trx_rollback_active() has completed execution, or when a concurrently + executing insert or update has committed the B-tree mini-transaction + but has not yet managed to restore the cursor position for writing + the big_rec. Note that the mini-transaction can be committed multiple + times, and the cursor restore can happen multiple times for single + insert or update statement. 
*/ + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + copy = rec_copy(buf, rec, offsets); + } else { + copy = rec; + } + + n_ext_cols = rec_offs_n_extern(offsets); + if (n_ext_cols) { + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols)); + } + + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(copy, index, true, const_cast<rec_offs*>(offsets)); + + if (!col_table) { + ut_ad(!col_map); + ut_ad(!defaults); + col_table = index->table; + } + + if (defaults) { + ut_ad(col_map); + row = dtuple_copy(defaults, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(col_table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else if (add_v != NULL) { + row = dtuple_create_with_vcol( + heap, dict_table_get_n_cols(col_table), + dict_table_get_n_v_cols(col_table) + add_v->n_v_col); + dict_table_copy_types(row, col_table); + + for (ulint i = 0; i < add_v->n_v_col; i++) { + dict_col_copy_type( + &add_v->v_col[i].m_col, + dfield_get_type(dtuple_get_nth_v_field( + row, i + col_table->n_v_def))); + } + } else { + row = dtuple_create_with_vcol( + heap, dict_table_get_n_cols(col_table), + dict_table_get_n_v_cols(col_table)); + dict_table_copy_types(row, col_table); + } + + dtuple_set_info_bits(row, rec_get_info_bits( + copy, rec_offs_comp(offsets))); + + j = 0; + + const dict_field_t* ind_field = index->fields; + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + if (i == index->first_user_field() + && rec_is_alter_metadata(rec, *index)) { + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_d(ulint len); + ut_d(rec_get_nth_field_offs(offsets, i, &len)); + ut_ad(len == FIELD_REF_SIZE); + continue; + } + + if (UNIV_UNLIKELY(ind_field + >= &index->fields[index->n_fields])) { + ut_ad(rec_is_metadata(rec, *index)); + continue; + } + + const dict_col_t* col = dict_field_get_col(ind_field); + + if ((ind_field++)->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + if (col->is_dropped()) { + continue; + } + + ulint col_no = dict_col_get_no(col); + + if (col_map) { + col_no = col_map[col_no]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + } + + dfield_t* dfield = dtuple_get_nth_field(row, col_no); + + const void* field = rec_get_nth_field( + copy, offsets, i, &len); + if (len == UNIV_SQL_DEFAULT) { + field = index->instant_field_value(i, &len); + if (field && type != ROW_COPY_POINTERS) { + field = mem_heap_dup(heap, field, len); + } + } + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + + col = dict_table_get_nth_col(col_table, col_no); + + if (col->ord_part) { + /* We will have to fetch prefixes of + externally stored columns that are + referenced by column prefixes. 
*/ + ext_cols[j++] = col_no; + } + } + } + + rec_offs_make_valid(rec, index, true, const_cast<rec_offs*>(offsets)); + + ut_ad(dtuple_check_typed(row)); + + if (!ext) { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. + + During online table rebuild, + row_log_table_apply_delete_low() + may use a cache that was set up by + row_log_table_delete(). */ + + } else if (j) { + *ext = row_ext_create(j, ext_cols, *index->table, row, + heap); + } else { + *ext = NULL; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + + +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead */ + const dtuple_t* defaults, + /*!< in: default values of + added and changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + return(row_build_low(type, index, rec, offsets, col_table, + defaults, NULL, col_map, ext, heap)); +} + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. 
+@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built; */ +dtuple_t* +row_build_w_add_vcol( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap) +{ + return(row_build_low(type, index, rec, offsets, col_table, + defaults, add_v, col_map, ext, heap)); +} + +/** Convert an index record to a data tuple. +@tparam metadata whether the index->instant_field_value() needs to be accessed +@tparam mblob 1 if rec_is_alter_metadata(); +2 if we want converted metadata corresponding to info_bits +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[out] n_ext number of externally stored columns +@param[in,out] heap memory heap for allocations +@param[in] info_bits (only used if mblob=2) +@param[in] pad (only used if mblob=2) +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +template<bool metadata, int mblob = 0> +static inline +dtuple_t* +row_rec_to_index_entry_impl( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits = 0, + bool pad = false) +{ + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(index != NULL); + ut_ad(!mblob || index->is_primary()); + ut_ad(!mblob || !index->table->is_temporary()); + ut_ad(!mblob || !dict_index_is_spatial(index)); + compile_time_assert(!mblob || metadata); + compile_time_assert(mblob <= 2); + /* Because this function may be invoked by row0merge.cc + on a record whose header is in different format, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + + const bool got = mblob == 2 && rec_is_alter_metadata(rec, *index); + ulint rec_len = rec_offs_n_fields(offsets); + if (mblob == 2) { + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + if (pad) { + ut_ad(rec_len <= ulint(index->n_fields + got)); + rec_len = ulint(index->n_fields) + + (info_bits == REC_INFO_METADATA_ALTER); + } else if (got) { + rec_len = std::min(rec_len, + ulint(index->n_fields + got)); + } else if (info_bits == REC_INFO_METADATA_ALTER) { + ut_ad(rec_len <= index->n_fields); + rec_len++; + } + } else { + ut_ad(info_bits == 0); + ut_ad(!pad); + } + dtuple_t* entry = dtuple_create(heap, rec_len); + dfield_t* dfield = entry->fields; + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(mblob == 2 + || rec_len == dict_index_get_n_fields(index) + uint(mblob == 1) + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. 
*/ + || (!index->table->is_temporary() + && index->table->id == DICT_INDEXES_ID + && rec_len + 1 == dict_index_get_n_fields(index))); + + ulint i; + for (i = 0; i < (mblob ? index->first_user_field() : rec_len); + i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, i), + &dfield->type); + if (!mblob + && dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield->type.mtype)) { + dfield->type.prtype |= DATA_GIS_MBR; + } + + ulint len; + const byte* field = metadata + ? rec_get_nth_cfield(rec, index, offsets, i, &len) + : rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + } + } + + if (mblob) { + ulint len; + const byte* field; + ulint j = i; + + if (mblob == 2) { + const bool want = info_bits == REC_INFO_METADATA_ALTER; + if (got == want) { + if (got) { + goto copy_metadata; + } + } else { + if (want) { + /* Allocate a placeholder for + adding metadata in an update. */ + len = FIELD_REF_SIZE; + field = static_cast<byte*>( + mem_heap_zalloc(heap, len)); + /* In reality there is one fewer + field present in the record. */ + rec_len--; + goto init_metadata; + } + + /* Skip the undesired metadata blob + (for example, when rolling back an + instant ALTER TABLE). */ + i++; + } + goto copy_user_fields; + } +copy_metadata: + ut_ad(rec_offs_nth_extern(offsets, i)); + field = rec_get_nth_field(rec, offsets, i++, &len); +init_metadata: + dfield->type.metadata_blob_init(); + ut_ad(len == FIELD_REF_SIZE); + dfield_set_data(dfield, field, len); + dfield_set_ext(dfield++); +copy_user_fields: + for (; i < rec_len; i++, dfield++) { + dict_col_copy_type(dict_index_get_nth_col(index, j++), + &dfield->type); + if (mblob == 2 && pad + && i >= rec_offs_n_fields(offsets)) { + field = index->instant_field_value(j - 1, + &len); + dfield_set_data(dfield, field, len); + continue; + } + + field = rec_get_nth_field(rec, offsets, i, &len); + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + } + } + } + + if (mblob == 2) { + ulint n_fields = ulint(dfield - entry->fields); + ut_ad(entry->n_fields >= n_fields); + entry->n_fields = n_fields; + } + ut_ad(dfield == entry->fields + entry->n_fields); + ut_ad(dtuple_check_typed(entry)); + return entry; +} + +/** Convert an index record to a data tuple. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in,out] heap memory heap for allocations */ +dtuple_t* +row_rec_to_index_entry_low( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap) +{ + return row_rec_to_index_entry_impl<false>(rec, index, offsets, heap); +} + +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. 
+@return own: index entry built */ +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ +{ + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(index != NULL); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* Take a copy of rec to heap */ + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); + + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_t* entry = rec_is_alter_metadata(copy_rec, *index) + ? row_rec_to_index_entry_impl<true,1>( + copy_rec, index, offsets, heap) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + return(entry); +} + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) +{ + ut_ad(info_bits == REC_INFO_METADATA_ALTER + || info_bits == REC_INFO_METADATA_ADD); + ut_ad(rec_is_metadata(rec, *index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + const rec_t* copy_rec = rec_copy( + static_cast<byte*>(mem_heap_alloc(heap, + rec_offs_size(offsets))), + rec, offsets); + + rec_offs_make_valid(copy_rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_t* entry = info_bits == REC_INFO_METADATA_ALTER + || rec_is_alter_metadata(copy_rec, *index) + ? row_rec_to_index_entry_impl<true,2>( + copy_rec, index, offsets, heap, info_bits, pad) + : row_rec_to_index_entry_impl<true>( + copy_rec, index, offsets, heap); + + rec_offs_make_valid(rec, index, true, + const_cast<rec_offs*>(offsets)); + + dtuple_set_info_bits(entry, info_bits); + return entry; +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! 
*/ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index != NULL); + ut_ad(rec != NULL); + ut_ad(heap != NULL); + ut_ad(!dict_index_is_clust(index)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &tmp_heap); + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); + + rec = rec_copy(buf, rec, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + ut_ad(!rec_offs_nth_default(offsets, pos)); + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! 
*/ + const dict_index_t* index, /*!< in: secondary index */ + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) + or NULL */ +{ + const dict_index_t* clust_index; + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(!dict_index_is_clust(index)); + ut_a(index->table); + + clust_index = dict_table_get_first_index(index->table); + ut_ad(clust_index); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + ut_ad(!rec_offs_nth_default(offsets, pos)); + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Searches the clustered index record for a row, if we have the row reference. +@return TRUE if found */ +bool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(dtuple_check_typed(ref)); + + dict_index_t *index = dict_table_get_first_index(table); + btr_pcur_init(pcur); + pcur->btr_cur.page_cur.index = index; + + if (UNIV_UNLIKELY(ref->info_bits != 0)) { + ut_ad(ref->is_metadata()); + ut_ad(ref->n_fields <= index->n_uniq); + if (pcur->open_leaf(true, index, mode, mtr) != DB_SUCCESS + || !btr_pcur_move_to_next_user_rec(pcur, mtr)) { + return false; + } + /* We do not necessarily have index->is_instant() here, + because we could be executing a rollback of an + instant ADD COLUMN operation. The function + rec_is_metadata() asserts index->is_instant(); + we do not want to call it here. 
*/ + return rec_get_info_bits(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG; + } else { + ut_a(ref->n_fields == index->n_uniq); + if (btr_pcur_open(ref, PAGE_CUR_LE, mode, pcur, mtr) + != DB_SUCCESS) { + return false; + } + } + + return !page_rec_is_infimum(btr_pcur_get_rec(pcur)) + && btr_pcur_get_low_match(pcur) == dtuple_get_n_fields(ref); +} + +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +rec_t* +row_get_clust_rec( +/*==============*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + + ut_ad(!dict_index_is_clust(index)); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + auto found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + mem_heap_free(heap); + + *clust_index = dict_table_get_first_index(table); + return found ? btr_pcur_get_rec(&pcur) : nullptr; +} + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +enum row_search_result +row_search_index_entry( +/*===================*/ + const dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint n_fields; + ulint low_match; + rec_t* rec; + + ut_ad(dtuple_check_typed(entry)); + + if (btr_pcur_open(entry, PAGE_CUR_LE, mode, pcur, mtr) != DB_SUCCESS) { + return ROW_NOT_FOUND; + } + + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + ut_ad(!(~mode & BTR_DELETE)); + return(ROW_NOT_DELETED_REF); + + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + return(ROW_BUFFERED); + + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + n_fields = dtuple_get_n_fields(entry); + + if (page_rec_is_infimum(rec)) { + + return(ROW_NOT_FOUND); + } else if (low_match != n_fields) { + + return(ROW_NOT_FOUND); + } + + return(ROW_FOUND); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_INT using "prtype" and writes the result to "buf". +If the data is in unknown format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). 
+@return number of bytes that were written */ +static +ulint +row_raw_format_int( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint ret; + + if (data_len <= sizeof(ib_uint64_t)) { + + ib_uint64_t value; + ibool unsigned_type = prtype & DATA_UNSIGNED; + + value = mach_read_int_type( + (const byte*) data, data_len, unsigned_type); + + ret = (ulint) snprintf( + buf, buf_size, + unsigned_type ? "%llu" : "%lld", (longlong) value)+1; + } else { + + *format_in_hex = TRUE; + ret = 0; + } + + return(ut_min(ret, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the +result to "buf". +If the data is in binary format, then nothing is written to "buf", +0 is returned and "format_in_hex" is set to TRUE, otherwise +"format_in_hex" is left untouched. +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). +@return number of bytes that were written */ +static +ulint +row_raw_format_str( +/*===============*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint prtype, /*!< in: precise type */ + char* buf, /*!< out: output buffer */ + ulint buf_size, /*!< in: output buffer size + in bytes */ + ibool* format_in_hex) /*!< out: should the data be + formatted in hex */ +{ + ulint charset_coll; + + if (buf_size == 0) { + + return(0); + } + + /* we assume system_charset_info is UTF-8 */ + + charset_coll = dtype_get_charset_coll(prtype); + + if (UNIV_LIKELY(dtype_is_utf8(prtype))) { + + return(ut_str_sql_format(data, data_len, buf, buf_size)); + } + /* else */ + + if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) { + + *format_in_hex = TRUE; + return(0); + } + /* else */ + + return(innobase_raw_format(data, data_len, charset_coll, + buf, buf_size)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). 
+@return number of bytes that were written */ +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint mtype; + ulint prtype; + ulint ret; + ibool format_in_hex; + + ut_ad(data_len != UNIV_SQL_DEFAULT); + + if (buf_size == 0) { + + return(0); + } + + if (data_len == UNIV_SQL_NULL) { + + ret = snprintf((char*) buf, buf_size, "NULL") + 1; + + return(ut_min(ret, buf_size)); + } + + mtype = dict_field->col->mtype; + prtype = dict_field->col->prtype; + + format_in_hex = FALSE; + + switch (mtype) { + case DATA_INT: + + ret = row_raw_format_int(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + break; + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + + ret = row_raw_format_str(data, data_len, prtype, + buf, buf_size, &format_in_hex); + if (format_in_hex) { + + goto format_in_hex; + } + + break; + /* XXX support more data types */ + default: + format_in_hex: + + if (UNIV_LIKELY(buf_size > 2)) { + + memcpy(buf, "0x", 2); + buf += 2; + buf_size -= 2; + ret = 2 + ut_raw_to_hex(data, data_len, + buf, buf_size); + } else { + + buf[0] = '\0'; + ret = 1; + } + } + + return(ret); +} + +#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT + +#ifdef HAVE_UT_CHRONO_T + +void +test_row_raw_format_int() +{ + ulint ret; + char buf[128]; + ibool format_in_hex; + ulint i; + +#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\ + ret_expected, buf_expected, format_in_hex_expected)\ + do {\ + ibool ok = TRUE;\ + ulint i;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + format_in_hex = FALSE;\ + fprintf(stderr, "TESTING \"\\x");\ + for (i = 0; i < data_len; i++) {\ + fprintf(stderr, "%02hhX", data[i]);\ + }\ + fprintf(stderr, "\", %lu, %lu, %lu\n",\ + (ulint) data_len, (ulint) prtype,\ + (ulint) buf_size);\ + ret = row_raw_format_int(data, data_len, prtype,\ + buf, buf_size, &format_in_hex);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (format_in_hex != format_in_hex_expected) {\ + fprintf(stderr, "expected format_in_hex %d, got %d\n",\ + (int) format_in_hex_expected,\ + (int) format_in_hex);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\ + (ulint) ret, buf, (int) format_in_hex);\ + } else {\ + return;\ + }\ + } while (0) + +#if 1 + /* min values for signed 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, 0, + buf, sizeof(buf), 5, "-128", 0); + + CALL_AND_TEST("\x00\x00", 2, 0, + buf, sizeof(buf), 7, "-32768", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, 0, + buf, sizeof(buf), 9, "-8388608", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, 0, + buf, sizeof(buf), 12, "-2147483648", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0, + buf, sizeof(buf), 14, "-549755813888", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0, + buf, sizeof(buf), 17, "-140737488355328", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0, + buf, sizeof(buf), 19, "-36028797018963968", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0, + buf, sizeof(buf), 21, "-9223372036854775808", 0); + + /* min values for unsigned 
1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + /* max values for signed 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, 0, + buf, sizeof(buf), 4, "127", 0); + + CALL_AND_TEST("\xFF\xFF", 2, 0, + buf, sizeof(buf), 6, "32767", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, 0, + buf, sizeof(buf), 8, "8388607", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0, + buf, sizeof(buf), 11, "2147483647", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0, + buf, sizeof(buf), 13, "549755813887", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0, + buf, sizeof(buf), 16, "140737488355327", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0, + buf, sizeof(buf), 18, "36028797018963967", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0, + buf, sizeof(buf), 20, "9223372036854775807", 0); + + /* max values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED, + buf, sizeof(buf), 4, "255", 0); + + CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "65535", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED, + buf, sizeof(buf), 9, "16777215", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED, + buf, sizeof(buf), 11, "4294967295", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED, + buf, sizeof(buf), 14, "1099511627775", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED, + buf, sizeof(buf), 16, "281474976710655", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED, + buf, sizeof(buf), 18, "72057594037927935", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED, + buf, sizeof(buf), 21, "18446744073709551615", 0); + + /* some random values */ + + CALL_AND_TEST("\x52", 1, 0, + buf, sizeof(buf), 4, "-46", 0); + + CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED, + buf, sizeof(buf), 3, "14", 0); + + CALL_AND_TEST("\x62\xCE", 2, 0, + buf, sizeof(buf), 6, "-7474", 0); + + CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "10710", 0); + + CALL_AND_TEST("\x7F\xFF\x90", 3, 0, + buf, sizeof(buf), 5, "-112", 0); + + CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED, + buf, sizeof(buf), 6, "41238", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0, + buf, sizeof(buf), 3, "-9", 0); + + CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED, + buf, sizeof(buf), 3, "92", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0, + buf, sizeof(buf), 6, "-9117", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED, + buf, sizeof(buf), 6, "91234", 0); +#endif + + /* speed test */ + + ut_chrono_t ch(__func__); + + for (i = 0; i < 1000000; i++) { + row_raw_format_int("\x23", 1, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x23", 1, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + 0, buf, 
sizeof(buf), + &format_in_hex); + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + } +} + +#endif /* HAVE_UT_CHRONO_T */ + +#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */ diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc new file mode 100644 index 00000000..6c76dd91 --- /dev/null +++ b/storage/innobase/row/row0sel.cc @@ -0,0 +1,6947 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***************************************************//** +@file row/row0sel.cc +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "gis0rtree.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "buf0lru.h" +#include "srv0srv.h" +#include "srv0mon.h" +#include "sql_error.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */ +#endif + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/********************************************************************//** +Returns TRUE if the user-defined column in a secondary index record +is alphabetically the same as the corresponding BLOB column in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! 
+@return whether the columns are equal */ +static +bool +row_sel_sec_rec_is_for_blob( +/*========================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a character, in bytes */ + const byte* clust_field, /*!< in: the locally stored part of + the clustered index column, including + the BLOB pointer; the clustered + index record must be covered by + a lock or a page latch to protect it + against deletion (rollback or purge) */ + ulint clust_len, /*!< in: length of clust_field */ + const byte* sec_field, /*!< in: column in secondary index */ + ulint sec_len, /*!< in: length of sec_field */ + ulint prefix_len, /*!< in: index column prefix length + in bytes, or 0 for full column */ + dict_table_t* table) /*!< in: table */ +{ + ulint len; + byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN + 1]; + + /* This function should never be invoked on tables in + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT, because they + should always contain enough prefix in the clustered index record. */ + ut_ad(dict_table_has_atomic_blobs(table)); + ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(!prefix_len || prefix_len >= sec_len); + ut_a(prefix_len <= sizeof buf); + + if (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return false; + } + + len = btr_copy_externally_stored_field_prefix( + buf, prefix_len ? prefix_len : sizeof buf, + table->space->zip_size(), + clust_field, clust_len); + + if (len == 0) { + /* The BLOB was being deleted as the server crashed. + There should not be any secondary index records + referring to this clustered index record, because + btr_free_externally_stored_field() is called after all + secondary index entries of the row have been purged. */ + return false; + } + + if (prefix_len) { + len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen, + prefix_len, len, + reinterpret_cast<const char*> + (buf)); + } else if (len >= sizeof buf) { + ut_ad("too long column" == 0); + return false; + } + + return !cmp_data(mtype, prtype, false, buf, len, sec_field, sec_len); +} + +/** Function to read the secondary spatial index, calculate +the minimum bounding rectangle for clustered index record +and secondary index record and compare it. 
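+If the clustered geometry column is stored externally, the full BLOB is fetched so that the MBR can be computed from the WKB data and compared with the MBR stored in the secondary index record.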
+@param sec_rec secondary index record +@param sec_index spatial secondary index +@param clust_rec clustered index record +@param clust_index clustered index +@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the + corresponding fields in the clustered record, when compared with + collation; +@retval DB_SUCCESS if not equal */ +static +dberr_t +row_sel_spatial_sec_rec_is_for_clust_rec( + const rec_t *sec_rec, const dict_index_t *sec_index, + const rec_t *clust_rec, dict_index_t *clust_index) +{ + mem_heap_t *heap= mem_heap_create(256); + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *clust_offs= clust_offsets_; + ulint clust_len; + + rec_offs_init(clust_offsets_); + ulint clust_pos= dict_col_get_clust_pos( + dict_index_get_nth_col(sec_index, 0), clust_index); + clust_offs= rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, clust_pos + 1, + &heap); + ut_ad(sec_index->n_user_defined_cols == 1); + const byte *clust_field= rec_get_nth_field(clust_rec, clust_offs, + clust_pos, &clust_len); + if (clust_len == UNIV_SQL_NULL || clust_len < GEO_DATA_HEADER_SIZE) + { + ut_ad("corrupted geometry column" == 0); +err_exit: + mem_heap_free(heap); + return DB_SUCCESS; + } + + /* For externally stored field, we need to get full + geo data to generate the MBR for comparing. */ + if (rec_offs_nth_extern(clust_offs, clust_pos)) + { + clust_field= btr_copy_externally_stored_field( + &clust_len, clust_field, sec_index->table->space->zip_size(), + clust_len, heap); + if (clust_field == NULL) + { + ut_ad("corrupted geometry blob" == 0); + goto err_exit; + } + } + + ut_ad(clust_len >= GEO_DATA_HEADER_SIZE); + rtr_mbr_t tmp_mbr; + rtr_mbr_t sec_mbr; + + rtree_mbr_from_wkb( + clust_field + GEO_DATA_HEADER_SIZE, + static_cast<uint>(clust_len - GEO_DATA_HEADER_SIZE), + SPDIMS, reinterpret_cast<double*>(&tmp_mbr)); + + rtr_read_mbr(sec_rec, &sec_mbr); + + mem_heap_free(heap); + return MBR_EQUAL_CMP(&sec_mbr, &tmp_mbr) + ? DB_SUCCESS_LOCKED_REC + : DB_SUCCESS; +} + +/** Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! +@param[in] sec_rec secondary index record +@param[in] sec_index secondary index +@param[in] clust_rec clustered index record; + must be protected by a page s-latch +@param[in] clust_index clustered index +@param[in] thr query thread +@retval DB_COMPUTE_VALUE_FAILED in case of virtual column value computation + failure. +@retval DB_SUCCESS_LOCKED_REC if the secondary record is equal to the + corresponding fields in the clustered record, when compared with + collation; +@retval DB_SUCCESS if not equal or if the clustered record has been marked + for deletion */ +static +dberr_t +row_sel_sec_rec_is_for_clust_rec( + const rec_t* sec_rec, + dict_index_t* sec_index, + const rec_t* clust_rec, + dict_index_t* clust_index, + que_thr_t* thr) +{ + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(clust_index->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(rec_get_trx_id(clust_rec, clust_index)); + + /* The clustered index record is delete-marked; + it is not visible in the read view. Besides, + if there are any externally stored columns, + some of them may have already been purged. 
*/ + return DB_SUCCESS; + } + + if (dict_index_is_spatial(sec_index)) { + return row_sel_spatial_sec_rec_is_for_clust_rec( + sec_rec, sec_index, clust_rec, + clust_index); + } + + const byte* sec_field; + ulint sec_len; + const byte* clust_field; + ulint n; + ulint i; + mem_heap_t* heap = mem_heap_create(256); + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs sec_offsets_[REC_OFFS_SMALL_SIZE]; + rec_offs* clust_offs = clust_offsets_; + rec_offs* sec_offs = sec_offsets_; + + rec_offs_init(clust_offsets_); + rec_offs_init(sec_offsets_); + + ib_vcol_row vc(heap); + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + sec_index->n_fields, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos = 0; + ulint clust_len = 0; + ulint len; + + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + const bool is_virtual = col->is_virtual(); + + /* For virtual column, its value will need to be + reconstructed from base column in cluster index */ + if (is_virtual) { + const dict_v_col_t* v_col; + dfield_t* vfield; + row_ext_t* ext; + + byte *record = vc.record(thr_get_trx(thr)->mysql_thd, + clust_index, + &thr->prebuilt->m_mysql_table); + + v_col = reinterpret_cast<const dict_v_col_t*>(col); + + dtuple_t* row = row_build( + ROW_COPY_POINTERS, + clust_index, clust_rec, + clust_offs, + NULL, NULL, NULL, &ext, heap); + + vfield = innobase_get_computed_value( + row, v_col, clust_index, + &heap, NULL, NULL, + thr_get_trx(thr)->mysql_thd, + thr->prebuilt->m_mysql_table, + record, NULL, NULL, + true); + + if (vfield == NULL) { + innobase_report_computed_value_failed(row); + return DB_COMPUTE_VALUE_FAILED; + } + len = clust_len = vfield->len; + clust_field = static_cast<byte*>(vfield->data); + } else { + clust_pos = dict_col_get_clust_pos(col, clust_index); + + clust_field = rec_get_nth_cfield( + clust_rec, clust_index, clust_offs, + clust_pos, &clust_len); + if (clust_len == UNIV_SQL_NULL) { + if (sec_len == UNIV_SQL_NULL) { + continue; + } + return DB_SUCCESS; + } + if (sec_len == UNIV_SQL_NULL) { + return DB_SUCCESS; + } + + len = clust_len; + ulint prefix_len = ifield->prefix_len; + if (rec_offs_nth_extern(clust_offs, clust_pos)) { + /* BLOB can contain prefix. */ + len -= BTR_EXTERN_FIELD_REF_SIZE; + if (!len) { + goto compare_blobs; + } + } + + if (prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, + col->mbmaxlen, prefix_len, len, + reinterpret_cast<const char*>( + clust_field)); + if (len < sec_len) { + goto check_for_blob; + } + } else { +check_for_blob: + if (rec_offs_nth_extern(clust_offs, + clust_pos)) { +compare_blobs: + if (!row_sel_sec_rec_is_for_blob( + col->mtype, col->prtype, + col->mbminlen, + col->mbmaxlen, + clust_field, clust_len, + sec_field, sec_len, + prefix_len, + clust_index->table)) { + return DB_SUCCESS; + } + + continue; + } + } + } + + if (cmp_data(col->mtype, col->prtype, false, + clust_field, len, sec_field, sec_len)) { + return DB_SUCCESS; + } + } + + return DB_SUCCESS_LOCKED_REC; +} + +/*********************************************************************//** +Creates a select node struct. 
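+The node is allocated from the supplied heap; it starts in state SEL_NODE_OPEN and has no table plans attached yet.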
+@return own: select node struct */ +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + sel_node_t* node; + + node = static_cast<sel_node_t*>( + mem_heap_alloc(heap, sizeof(sel_node_t))); + + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->plans = NULL; + + return(node); +} + +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /*!< in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/*********************************************************************//** +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /*!< in: first variable in a list of + variables */ + sel_node_t* node) /*!< in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + for (exp = node->select_list; + var != 0; + var = static_cast<sym_node_t*>(que_node_get_next(var))) { + + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + } +} + +/*********************************************************************//** +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /*!< in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + for (func_node = static_cast<func_node_t*>(node->select_list); + func_node != 0; + func_node = static_cast<func_node_t*>( + que_node_get_next(func_node))) { + + eval_node_set_int_val(func_node, 0); + } + + node->aggregate_already_fetched = FALSE; +} + +/*********************************************************************//** +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /*!< in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/*********************************************************************//** +Fetches the column values from a record. 
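+Externally stored columns are copied to a temporary heap before being assigned; ordinary columns are either copied or referenced in place, depending on column->copy_val.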
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /*!< in: record index */ + const rec_t* rec, /*!< in: record in a clustered or non-clustered + index; must be protected by a page latch */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (dict_index_is_clust(index)) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + mem_heap_t* heap = NULL; + ibool needs_copy; + + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + if (UNIV_UNLIKELY(rec_offs_nth_extern( + offsets, field_no) != 0)) { + + /* Copy an externally stored field to the + temporary heap, if possible. */ + + heap = mem_heap_create(1); + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + index->table->space->zip_size(), + field_no, &len, heap); + + /* data == NULL means that the + externally stored field was not + written yet. This record + should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED + transactions. The InnoDB SQL parser + (the sole caller of this function) + does not implement READ UNCOMMITTED, + and it is not involved during rollback. */ + ut_a(data); + ut_a(len != UNIV_SQL_NULL); + + needs_copy = TRUE; + } else { + data = rec_get_nth_cfield(rec, index, offsets, + field_no, &len); + needs_copy = column->copy_val; + } + + if (needs_copy) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /*!< in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = static_cast<sel_buf_t*>( + ut_malloc_nokey(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t))); + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + sel_buf->len = 0; + sel_buf->val_buf_size = 0; + } +} + +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + ut_free(sel_buf->data); + } + } + + ut_free(prefetch_buf); +} + +/*********************************************************************//** +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. 
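+The prefetch buffer slot and the val field swap their data pointers, so that the memory allocated for the previous value can be reused and freed later.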
*/ +static +void +sel_dequeue_prefetched_row( +/*=======================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); + ut_d(dfield_set_null(val)); + + goto next_col; + } + + ut_ad(column->prefetch_buf); + ut_ad(!dfield_is_ext(val)); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = static_cast<byte*>(dfield_get_data(val)); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/*********************************************************************//** +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_enqueue_prefetched_row( +/*=======================*/ + plan_t* plan) /*!< in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + for (column = UT_LIST_GET_FIRST(plan->columns); + column != 0; + column = UT_LIST_GET_NEXT(col_var_list, column)) { + + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! 
*/ + continue; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = static_cast<byte*>(dfield_get_data(val)); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; + } +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_sel_build_prev_vers( +/*====================*/ + ReadView* read_view, /*!< in: read view */ + dict_index_t* index, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t** old_vers_heap, /*!< out: old version heap to use */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + + if (*old_vers_heap) { + mem_heap_empty(*old_vers_heap); + } else { + *old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, index, offsets, read_view, offset_heap, + *old_vers_heap, old_vers, NULL); + return(err); +} + +/*********************************************************************//** +Builds the last committed version of a clustered index record for a +semi-consistent read. */ +static +void +row_sel_build_committed_vers_for_mysql( +/*===================================*/ + dict_index_t* clust_index, /*!< in: clustered index */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ + const rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + const rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + dtuple_t** vrow, /*!< out: to be filled with old virtual + column version if any */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create( + rec_offs_size(*offsets)); + } + + row_vers_build_for_semi_consistent_read(prebuilt->trx, + rec, mtr, clust_index, offsets, offset_heap, + prebuilt->old_vers_heap, old_vers, vrow); +} + +/*********************************************************************//** +Tests the conditions which determine when the index segment we are searching +through has been exhausted. 
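+Each condition in plan->end_conds compares a column to an expression; the segment is considered exhausted as soon as one comparison evaluates to FALSE.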
+@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + for (cond = UT_LIST_GET_FIRST(plan->end_conds); + cond != 0; + cond = UT_LIST_GET_NEXT(cond_list, cond)) { + + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(static_cast<sym_node_t*>(cond->args)); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + } + + return(TRUE); +} + +/*********************************************************************//** +Tests the other conditions. +@return TRUE if row passed the tests */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + plan_t* plan) /*!< in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/** Check that a clustered index record is visible in a consistent read view. +@param rec clustered index record (in leaf page, or in memory) +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param view consistent read view +@retval DB_SUCCESS if rec is visible in view +@retval DB_SUCCESS_LOCKED_REC if rec is not visible in view +@retval DB_CORRUPTION if the DB_TRX_ID is corrupted */ +static dberr_t row_sel_clust_sees(const rec_t *rec, const dict_index_t &index, + const rec_offs *offsets, + const ReadView &view) +{ + ut_ad(index.is_primary()); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, &index, offsets)); + ut_ad(!rec_is_metadata(rec, index)); + ut_ad(!index.table->is_temporary()); + + const trx_id_t id= row_get_rec_trx_id(rec, &index, offsets); + + if (view.changes_visible(id)) + return DB_SUCCESS; + if (UNIV_LIKELY(id < view.low_limit_id() || id < trx_sys.get_max_trx_id())) + return DB_SUCCESS_LOCKED_REC; + + ib::warn() << "A transaction id in a record of table " << index.table->name + << " is newer than the system-wide maximum."; + return DB_CORRUPTION; +} + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. 
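+For a locking read, a lock is placed on the clustered index record; for a consistent read, an earlier version may be built, and the secondary index record is re-checked against it so that rows which did not exist in the read view are ignored.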
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_sel_get_clust_rec( +/*==================*/ + sel_node_t* node, /*!< in: select_node */ + plan_t* plan, /*!< in: plan node for table */ + rec_t* rec, /*!< in: record in a non-clustered index */ + que_thr_t* thr, /*!< in: query thread */ + rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, plan->pcur.index(), offsets, + plan->pcur.index()->n_core_fields, + ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + plan->clust_pcur.old_rec = nullptr; + plan->clust_pcur.btr_cur.page_cur.index = index; + dberr_t err = btr_pcur_open_with_no_init(plan->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + &plan->clust_pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + if (!node->read_view || + !rec_get_deleted_flag(rec, plan->table->not_redundant())) { + err = DB_CORRUPTION; + } + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.cc + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + goto err_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + trx_t* trx = thr_get_trx(thr); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level + we lock only the record, i.e., next-key locking is + not used. */ + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&plan->clust_pcur), + clust_rec, index, offsets, + node->row_lock_mode, + trx->isolation_level <= TRX_ISO_READ_COMMITTED + ? LOCK_REC_NOT_GAP : LOCK_ORDINARY, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + /* Declare the variable uninitialized. + It should be set to DB_SUCCESS at func_exit. 
*/ + MEM_UNDEFINED(&err, sizeof err); + break; + default: + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + err = row_sel_clust_sees(clust_rec, *index, offsets, + *node->read_view); + + switch (err) { + default: + goto err_exit; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + err = row_sel_build_prev_vers( + node->read_view, index, clust_rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto err_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if (old_vers || rec_get_deleted_flag(rec, dict_table_is_comp( + plan->table))) { + err = row_sel_sec_rec_is_for_clust_rec(rec, + plan->index, clust_rec, + index, thr); + if (err != DB_SUCCESS_LOCKED_REC) { + goto err_exit; + } + } + } + + /* Fetch the columns needed in test conditions. The clustered + index record is protected by a page latch that was acquired + when plan->clust_pcur was positioned. The latch will not be + released until mtr->commit(). */ + + ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets))); + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/*********************************************************************//** +Sets a lock on a page of R-Tree record. 
This is an all-or-none action,
+mainly because we cannot reposition a record in an R-tree (owing to the
+nature of node splitting)
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rtr_rec_lock(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in: cursor */
+ const rec_t* first_rec,/*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ unsigned mode, /*!< in: lock mode */
+ unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ matched_rec_t* match = pcur->btr_cur.rtr_info->matches;
+ mem_heap_t* heap = NULL;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* cur_block = btr_pcur_get_block(pcur);
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* my_offsets = const_cast<rec_offs*>(offsets);
+ rec_t* rec = const_cast<rec_t*>(first_rec);
+ rtr_rec_vector* match_rec;
+ rtr_rec_vector::iterator end;
+
+ rec_offs_init(offsets_);
+
+ if (match->locked || page_rec_is_supremum(first_rec)) {
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ ut_ad(page_align(first_rec) == cur_block->page.frame);
+ ut_ad(match->valid);
+
+ match->block.page.lock.x_lock();
+retry:
+ cur_block = btr_pcur_get_block(pcur);
+ ut_ad(match->block.page.lock.have_x()
+ || match->block.page.lock.have_s());
+ ut_ad(page_is_leaf(cur_block->page.frame));
+
+ err = lock_sec_rec_read_check_and_lock(
+ 0, cur_block, rec, index, my_offsets,
+ static_cast<lock_mode>(mode), type, thr);
+
+ if (err == DB_LOCK_WAIT) {
+re_scan:
+ mtr->commit();
+ trx->error_state = err;
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ if (row_mysql_handle_errors(
+ &err, trx, thr, NULL)) {
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr->start();
+
+ mysql_mutex_lock(&match->rtr_match_mutex);
+ if (!match->valid && match->matched_recs->empty()) {
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+ mysql_mutex_unlock(&match->rtr_match_mutex);
+
+ /* MDEV-14059 FIXME: why re-latch the block?
+ pcur is already positioned on it! */
+ cur_block = buf_page_get_gen(
+ btr_pcur_get_block(pcur)->page.id(),
+ btr_pcur_get_block(pcur)->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET, mtr, &err);
+ if (!cur_block) {
+ goto func_end;
+ }
+ } else {
+ mtr->start();
+ goto func_end;
+ }
+
+ DEBUG_SYNC_C("rtr_set_lock_wait");
+
+ if (!match->valid) {
+ /* Page got deleted */
+ mtr->commit();
+ mtr->start();
+ err = DB_RECORD_NOT_FOUND;
+ goto func_end;
+ }
+
+ match->matched_recs->clear();
+ // FIXME: check for !cur_block
+
+ rtr_cur_search_with_match(
+ cur_block, index,
+ pcur->btr_cur.rtr_info->search_tuple,
+ pcur->btr_cur.rtr_info->search_mode,
+ &pcur->btr_cur.page_cur,
+ pcur->btr_cur.rtr_info);
+
+ if (!page_is_leaf(buf_block_get_frame(cur_block))) {
+ /* Page got split and promoted (this is
+ possible only for the root page). 
Release the + page and ask for a re-search */ + mtr->commit(); + mtr->start(); + err = DB_RECORD_NOT_FOUND; + goto func_end; + } + + rec = btr_pcur_get_rec(pcur); + my_offsets = offsets_; + my_offsets = rec_get_offsets(rec, index, my_offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + + /* No match record */ + if (page_rec_is_supremum(rec) || !match->valid) { + mtr->commit(); + mtr->start(); + err = DB_RECORD_NOT_FOUND; + goto func_end; + } + + goto retry; + } + + my_offsets = offsets_; + match_rec = match->matched_recs; + end = match_rec->end(); + + for (rtr_rec_vector::iterator it = match_rec->begin(); + it != end; ++it) { + rtr_rec_t* rtr_rec = &(*it); + + my_offsets = rec_get_offsets( + rtr_rec->r_rec, index, my_offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + err = lock_sec_rec_read_check_and_lock( + 0, &match->block, rtr_rec->r_rec, index, + my_offsets, static_cast<lock_mode>(mode), + type, thr); + + if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) { + rtr_rec->locked = true; + } else if (err == DB_LOCK_WAIT) { + goto re_scan; + } else { + goto func_end; + } + } + + match->locked = true; + +func_end: + match->block.page.lock.x_unlock(); + if (heap != NULL) { + mem_heap_free(heap); + } + + ut_ad(err != DB_LOCK_WAIT); + + return(err); +} + +/*********************************************************************//** +Sets a lock on a record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +UNIV_INLINE +dberr_t +sel_set_rec_lock( +/*=============*/ + btr_pcur_t* pcur, /*!< in: cursor */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + unsigned mode, /*!< in: lock mode */ + unsigned type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOC_REC_NOT_GAP */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + trx_t* trx; + dberr_t err = DB_SUCCESS; + const buf_block_t* block; + + block = btr_pcur_get_block(pcur); + + trx = thr_get_trx(thr); + + if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000 + && buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; + } + + if (dict_index_is_clust(index)) { + err = lock_clust_rec_read_check_and_lock( + 0, block, rec, index, offsets, + static_cast<lock_mode>(mode), type, thr); + } else { + + if (dict_index_is_spatial(index)) { + if (type == LOCK_GAP || type == LOCK_ORDINARY) { + ut_ad(0); + ib::error() << "Incorrectly request GAP lock " + "on RTree"; + return(DB_SUCCESS); + } + err = sel_set_rtr_rec_lock(pcur, rec, index, offsets, + mode, type, thr, mtr); + } else { + err = lock_sec_rec_read_check_and_lock( + 0, block, rec, index, offsets, + static_cast<lock_mode>(mode), type, thr); + } + } + + return(err); +} + +/*********************************************************************//** +Opens a pcur to a table index. 
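+The search tuple, if any, is built from the already evaluated right-hand
+sides of plan->end_conds and the cursor is positioned with
+btr_pcur_open_with_no_init(); without a search tuple the cursor is simply
+opened at one end of the index with open_leaf().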
*/ +MY_ATTRIBUTE((warn_unused_result, nonnull)) +static +dberr_t +row_sel_open_pcur( +/*==============*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + func_node_t* cond; + que_node_t* exp; + ulint n_fields; + ulint i; + + ut_ad(!plan->n_rows_prefetched); + ut_ad(!plan->n_rows_fetched); + ut_ad(!plan->cursor_at_end); + + index = plan->index; + + /* Calculate the value of the search tuple: the exact match columns + get their expressions evaluated when we evaluate the right sides of + end_conds */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + eval_exp(que_node_get_next(cond->args)); + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + plan->pcur.old_rec = nullptr; + plan->pcur.btr_cur.page_cur.index = index; + + dberr_t err; + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + + if (plan->n_exact_match < n_fields) { + /* There is a non-exact match field which must be + evaluated separately */ + + eval_exp(plan->tuple_exps[n_fields - 1]); + } + + for (i = 0; i < n_fields; i++) { + exp = plan->tuple_exps[i]; + + dfield_copy_data(dtuple_get_nth_field(plan->tuple, i), + que_node_get_val(exp)); + } + + err = btr_pcur_open_with_no_init(plan->tuple, + plan->mode, BTR_SEARCH_LEAF, + &plan->pcur, mtr); + } else { + err = plan->pcur.open_leaf(plan->asc, index, BTR_SEARCH_LEAF, + mtr); + } + + plan->pcur_is_open = err == DB_SUCCESS; + return err; +} + +/*********************************************************************//** +Restores a stored pcur position to a table index. +@return TRUE if the cursor should be moved to the next record after we +return from this function (moved to the previous, in the case of a +descending cursor) without processing again the current cursor +record */ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + plan_t* plan, /*!< in: table plan */ + mtr_t* mtr) /*!< in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = + plan->pcur.restore_position(BTR_SEARCH_LEAF, mtr) == + btr_pcur_t::SAME_ALL; + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. 
*/ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/*********************************************************************//** +Resets a plan cursor to a closed state. */ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /*!< in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +#ifdef BTR_CUR_HASH_ADAPT +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). 
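+The shortcut is attempted only for a consistent read with a unique search
+condition and no separate clustered index fetch (see the assertions at the
+start of the function). Whenever the outcome cannot be decided safely,
+SEL_RETRY is returned and the caller falls back to the ordinary cursor path.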
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + sel_node_t* node, /*!< in: select node for a consistent read */ + plan_t* plan, /*!< in: plan for a unique search in clustered + index */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = plan->index; + + ut_ad(!index->table->is_temporary()); + ut_ad(node->read_view); + ut_ad(node->read_view->is_open()); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); + + if (row_sel_open_pcur(plan, mtr) != DB_SUCCESS) { + return SEL_RETRY; + } + + const rec_t* rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { + return SEL_RETRY; + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + return SEL_EXHAUSTED; + } + + if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!node->read_view->changes_visible(bulk_trx_id)) { + return SEL_EXHAUSTED; + } + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_is_clust(index)) { + if (row_sel_clust_sees(rec, *index, offsets, *node->read_view) + != DB_SUCCESS) { + return SEL_RETRY; + } + } else if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id(page_align(rec)); + ut_ad(trx_id); + if (!node->read_view->sees(trx_id)) { + return SEL_RETRY; + } + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { + return SEL_EXHAUSTED; + } + + /* Fetch the columns needed in test conditions. The index + record is protected by a page latch that was acquired when + plan->pcur was positioned. The latch will not be released + until mtr->commit(). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + return SEL_EXHAUSTED; + } + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + plan->n_rows_fetched++; + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return SEL_FOUND; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/*********************************************************************//** +Performs a select step. 
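+A rough sketch of the control flow, for orientation only (the labels are
+the actual goto labels used in the function body):
+ table_loop: pick the plan of node->fetch_table, then open its cursor or
+             restore a stored cursor position;
+ rec_loop:   place locks for a locking read, check visibility against the
+             read view, test end_conds, fetch the clustered index record
+             when required, test other_conds, then prefetch the row or
+             goto next_table;
+ next_table: advance to the next table of the join, or assign the result
+             row with sel_assign_into_var_values().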
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_sel( +/*====*/ + sel_node_t* node, /*!< in: select node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + dberr_t err; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + const trx_t* trx = thr_get_trx(thr); + + ut_ad(thr->run_node == node); + ut_ad(!node->read_view || node->read_view == &trx->read_view); + ut_ad(!node->read_view || node->read_view->is_open()); + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr.start(); + +#ifdef BTR_CUR_HASH_ADAPT + if (node->read_view && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust) { + switch (row_sel_try_search_shortcut(node, plan, &mtr)) { + case SEL_FOUND: + goto next_table; + case SEL_EXHAUSTED: + goto table_exhausted; + default: + ut_ad(0); + /* fall through */ + case SEL_RETRY: + break; + } + + plan_reset_cursor(plan); + + mtr.commit(); + mtr.start(); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + err = row_sel_open_pcur(plan, &mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto mtr_commit_exit; + } + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + + if (!node->read_view + || trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!trx->read_view.changes_visible(bulk_trx_id)) { + goto table_exhausted; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill 
the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && !page_rec_is_supremum(rec)) { + + /* Do not support "descending search" for Spatial index */ + ut_ad(!dict_index_is_spatial(index)); + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!node->read_view) { + const rec_t* next_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec)) { + err = DB_CORRUPTION; + goto lock_wait_or_error; + } + unsigned lock_type; + + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* At READ UNCOMMITTED or READ COMMITTED + isolation level, we lock only the record, + i.e., next-key locking is not used. */ + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + if (page_rec_is_supremum(next_rec)) { + goto skip_lock; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(&plan->pcur, + next_rec, index, offsets, + node->row_lock_mode, + lock_type, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + goto lock_wait_or_error; + } + } + } + +skip_lock: + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (rec_is_metadata(rec, *index)) { + /* Skip the metadata pseudo-record. */ + cost_counter++; + goto next_rec; + } + + if (!node->read_view) { + /* Try to place a lock on the index record */ + unsigned lock_type; + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level, + we lock only the record, i.e., next-key locking is + not used. 
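+ (At REPEATABLE READ and SERIALIZABLE, and except on spatial
+ indexes, LOCK_ORDINARY next-key locks are taken instead, so the
+ gap before the record is locked too and phantom rows cannot
+ appear in the result.)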
*/ + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + || dict_index_is_spatial(index)) { + + if (page_rec_is_supremum(rec)) { + + goto next_rec; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(&plan->pcur, + rec, index, offsets, + node->row_lock_mode, lock_type, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + if (page_rec_is_supremum(rec)) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (node->read_view) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (dict_index_is_clust(index)) { + const trx_id_t id = row_get_rec_trx_id( + rec, index, offsets); + + if (!node->read_view->changes_visible(id)) { + if (id >= node->read_view->low_limit_id() + && id >= trx_sys.get_max_trx_id()) { + err = DB_CORRUPTION; + goto lock_wait_or_error; + } + + err = row_sel_build_prev_vers( + node->read_view, index, rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The record does not exist + in our read view. Skip it, but + first attempt to determine + whether the index segment we + are searching through has been + exhausted. */ + + offsets = rec_get_offsets( + rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* Fetch the columns needed in + test conditions. The clustered + index record is protected by a + page latch that was acquired + by row_sel_open_pcur() or + row_sel_restore_pcur_pos(). + The latch will not be released + until mtr.commit(). 
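+ Only the end conditions are
+ tested on this invisible
+ version: if they fail, the
+ index range is exhausted for
+ this plan; otherwise the
+ record is simply skipped.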
*/ + + row_sel_fetch_columns( + index, rec, offsets, + UT_LIST_GET_FIRST( + plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id(page_align(rec)); + ut_ad(trx_id); + if (!node->read_view->sees(trx_id)) { + cons_read_requires_clust_rec = TRUE; + } + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions. The record is + protected by a page latch that was acquired by + row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch + will not be released until mtr.commit(). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(node->read_view); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(plan->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. 
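+ (A record is never delete-marked by a plain INSERT; the
+ mark is set only by UPDATE or DELETE, which always write
+ an undo log record.)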
*/ + ut_ad(rec_get_trx_id(clust_rec, + dict_table_get_first_index( + plan->table))); + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_enqueue_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_dequeue_prefetched_row(plan); + + goto next_table; + } + +next_rec: + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&plan->pcur)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_dequeue_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + } else { + node->state = 
SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + err = DB_SUCCESS; + goto mtr_commit_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. */ + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr.commit(); + + mtr_has_extra_clust_latch = FALSE; + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); +mtr_commit_exit: + mtr.commit(); + +func_exit: + if (heap != NULL) { + mem_heap_free(heap); + } + return(err); +} + +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* node; + + ut_ad(thr); + + node = static_cast<sel_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started_xa(thr_get_trx(thr), false); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + trx_t *trx = thr_get_trx(thr); + /* Assign a read view for the query */ + trx->read_view.open(trx); + node->read_view = trx->read_view.is_open() ? 
+ &trx->read_view : NULL; + } else { + sym_node_t* table_node; + lock_mode i_lock_mode; + + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + for (table_node = node->table_list; + table_node != 0; + table_node = static_cast<sym_node_t*>( + que_node_get_next(table_node))) { + + dberr_t err = lock_table( + table_node->table, nullptr, + i_lock_mode, thr); + + if (err != DB_SUCCESS) { + trx_t* trx; + + trx = thr_get_trx(thr); + trx->error_state = err; + + return(NULL); + } + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor + && UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + dberr_t err = row_sel(node, thr); + + /* NOTE! if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err != DB_SUCCESS) { + thr_get_trx(thr)->error_state = err; + + return(NULL); + } + + return(thr); +} + +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = static_cast<fetch_node_t*>(thr->run_node); + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + if (node->into_list) { + sel_assign_into_var_values(node->into_list, + sel_node); + } else { + ibool ret = (*node->func->func)( + sel_node, node->func->arg); + + if (!ret) { + sel_node->state + = SEL_NODE_NO_MORE_ROWS; + } + } + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + ib::error() << "fetch called on a closed cursor"; + + thr_get_trx(thr)->error_state = DB_ERROR; + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/***********************************************************//** +Prints a row in a select result. 
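+Each value in sel_node->select_list is printed to stderr with
+dfield_print_also_hex(), separated by " ::: ", after which the cursor is
+positioned for the next fetch.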
+@return query thread to run next or NULL */ +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = static_cast<row_printf_node_t*>(thr->run_node); + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len) /*!< in: MySQL key value length */ +{ + byte* original_buf = buf; + const byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + const byte* key_end; + ulint n_fields = 0; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. 
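+ That generated index has a single ordering field, the 6-byte
+ ROW_ID, which is why key_len must equal DATA_ROW_ID_LEN below.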
*/ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + + ulint type = dfield_get_type(dfield)->mtype; + ut_a(field->col->mtype == type); + + data_offset = 0; + is_null = FALSE; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + data_offset = 1; + + if (*key_ptr != 0) { + dfield_set_null(dfield); + + is_null = TRUE; + } + } + + /* Calculate data length and data field total length */ + if (DATA_LARGE_MTYPE(type) || DATA_GEOMETRY_MTYPE(type)) { + + /* For R-tree index, data length should be the + total size of the wkb data.*/ + if (dict_index_is_spatial(index)) { + ut_ad(DATA_GEOMETRY_MTYPE(type)); + data_len = key_len; + data_field_len = data_offset + data_len; + } else { + /* The key field is a column prefix of a BLOB + or TEXT. */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the + first 2 bytes after the optional SQL NULL + marker byte. The storage format is + little-endian, that is, the most significant + byte at a higher address. In UTF-8, MySQL + seems to reserve field->prefix_len bytes for + storing this field in the key value buffer, + even though the actual value only takes data + len bytes from the start. */ + + data_len = ulint(key_ptr[data_offset]) + | ulint(key_ptr[data_offset + 1]) << 8; + data_field_len = data_offset + 2 + + field->prefix_len; + + data_offset += 2; + + /* Now that we know the length, we store the + column value like it would be a fixed char + field */ + } + + + } else if (field->prefix_len > 0) { + /* Looks like MySQL pads unused end bytes in the + prefix with space. Therefore, also in UTF-8, it is ok + to compare with a prefix containing full prefix_len + bytes, and no need to take at most prefix_len / 3 + UTF-8 characters from the start. + If the prefix is used as the upper end of a LIKE + 'abc%' query, then MySQL pads the end with chars + 0xff. TODO: in that case does it any harm to compare + with the full prefix_len bytes. How do characters + 0xff in UTF-8 behave? */ + + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + if ((dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR) + && (type != DATA_INT)) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! */ + + data_len += 2; + data_field_len += 2; + } + + /* Storing may use at most data_len bytes of buf */ + + if (UNIV_LIKELY(!is_null)) { + buf = row_mysql_store_col_in_innobase_format( + dfield, buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, data_len, + dict_table_is_comp(index->table)); + ut_a(buf <= original_buf + buf_len); + } + + key_ptr += data_field_len; + + if (UNIV_UNLIKELY(key_ptr > key_end)) { + /* The last field in key was not a complete key field + but a prefix of it. + + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. 
Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. */ + + ib::warn() << "Using a partial-field key prefix in" + " search, index " << index->name + << " of table " << index->table->name + << ". Last data field length " + << data_field_len << " bytes, key ptr now" + " exceeds key end by " << (key_ptr - key_end) + << " bytes. Key value in the MariaDB format:"; + + ut_print_buf(stderr, original_key_ptr, key_len); + putc('\n', stderr); + + if (!is_null) { + ulint len = dfield_get_len(dfield); + dfield_set_len(dfield, len + - (ulint) (key_ptr - key_end)); + } + ut_ad(0); + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */ +void +row_sel_field_store_in_mysql_format_func( + byte* dest, + const mysql_row_templ_t* templ, +#ifdef UNIV_DEBUG + const dict_index_t* index, + ulint field_no, +#endif /* UNIV_DEBUG */ + const byte* data, + ulint len) +{ +#ifdef UNIV_DEBUG + const dict_field_t* field + = templ->is_virtual + ? NULL : dict_index_get_nth_field(index, field_no); +#endif /* UNIV_DEBUG */ + + ut_ad(len != UNIV_SQL_NULL); + MEM_CHECK_DEFINED(data, len); + MEM_CHECK_ADDRESSABLE(dest, templ->mysql_col_len); + MEM_UNDEFINED(dest, templ->mysql_col_len); + + byte* pad = dest + len; + + switch (templ->type) { + const byte* field_end; + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, templ->mysql_length_bytes); + /* Copy the actual data. Leave the rest of the + buffer uninitialized. */ + memcpy(dest, data, len); + break; + } + + /* Copy the actual data */ + memcpy(dest, data, len); + + /* Pad with trailing spaces. */ + + if (pad == field_end) { + break; + } + + if (UNIV_UNLIKELY(templ->type == DATA_FIXBINARY)) { + memset(pad, 0, field_end - pad); + break; + } + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We treat some Unicode charset strings specially. */ + switch (templ->mbminlen) { + case 4: + /* InnoDB should never have stripped partial + UTF-32 characters. */ + ut_a(!(len & 3)); + break; + case 2: + /* A space char is two bytes, + 0x0020 in UCS2 and UTF-16 */ + + if (UNIV_UNLIKELY(len & 1)) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad < field_end) { + *pad++ = 0x20; + } + } + } + + row_mysql_pad_col(templ->mbminlen, pad, + ulint(field_end - pad)); + break; + + case DATA_BLOB: + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + break; + + case DATA_GEOMETRY: + /* We store all geometry data as BLOB data at server layer. 
*/ + row_mysql_store_geometry(dest, templ->mysql_col_len, data, len); + break; + + case DATA_MYSQL: + memcpy(dest, data, len); + + ut_ad(templ->mysql_col_len >= len); + ut_ad(templ->mbmaxlen >= templ->mbminlen); + + /* If field_no equals to templ->icp_rec_field_no, + we are examining a row pointed by "icp_rec_field_no". + There is possibility that icp_rec_field_no refers to + a field in a secondary index while templ->rec_field_no + points to field in a primary index. The length + should still be equal, unless the field pointed + by icp_rec_field_no has a prefix */ + ut_ad(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len + || (field_no == templ->icp_rec_field_no + && field->prefix_len > 0)); + + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len + || (field_no == templ->icp_rec_field_no + && field->prefix_len > 0) + || templ->rec_field_is_prefix); + + ut_ad(templ->is_virtual + || !(field->prefix_len % templ->mbmaxlen)); + + if (templ->mbminlen == 1 && templ->mbmaxlen != 1) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.cc, function + row_mysql_store_col_in_innobase_format(). */ + + memset(pad, 0x20, templ->mysql_col_len - len); + } + break; + + default: +#ifdef UNIV_DEBUG + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + /* fall through */ + + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: +#endif /* UNIV_DEBUG */ + ut_ad((templ->is_virtual && !field) + || (field && field->prefix_len + ? field->prefix_len == len + : templ->mysql_col_len == len)); + memcpy(dest, data, len); + break; + + case DATA_INT: + /* Convert InnoDB big-endian integer to little-endian + format, sign bit restored to 2's complement form */ + DBUG_ASSERT(templ->mysql_col_len == len); + + byte* ptr = pad; + do *--ptr = *data++; while (ptr != dest); + if (!templ->is_unsigned) { + pad[-1] ^= 0x80; + } + } +} + +/** Convert a field in the Innobase format to a field in the MySQL format. 
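+Externally stored (BLOB) columns are first copied from off-page storage into
+prebuilt->blob_heap or a temporary heap before being converted; SQL NULL
+values are materialized from prebuilt->default_rec.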
+@param[out] mysql_rec record in the MySQL format +@param[in,out] prebuilt prebuilt struct +@param[in] rec InnoDB record; must be protected + by a page latch +@param[in] index index of rec +@param[in] offsets array returned by rec_get_offsets() +@param[in] field_no templ->rec_field_no or + templ->clust_rec_field_no + or templ->icp_rec_field_no +@param[in] templ row template +*/ +static MY_ATTRIBUTE((warn_unused_result)) +ibool +row_sel_store_mysql_field( + byte* mysql_rec, + row_prebuilt_t* prebuilt, + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint field_no, + const mysql_row_templ_t*templ) +{ + DBUG_ENTER("row_sel_store_mysql_field_func"); + + const byte* data; + ulint len; + + ut_ad(prebuilt->default_rec); + ut_ad(templ); + ut_ad(templ >= prebuilt->mysql_template); + ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]); + ut_ad(field_no == templ->clust_rec_field_no + || field_no == templ->rec_field_no + || field_no == templ->icp_rec_field_no); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no) != 0)) { + + mem_heap_t* heap; + /* Copy an externally stored field to a temporary heap */ + + ut_ad(field_no == templ->clust_rec_field_no); + + if (DATA_LARGE_MTYPE(templ->type)) { + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + srv_page_size); + } + + heap = prebuilt->blob_heap; + } else { + heap = mem_heap_create(srv_page_size); + } + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field( + rec, offsets, prebuilt->table->space->zip_size(), + field_no, &len, heap); + + if (UNIV_UNLIKELY(!data)) { + /* The externally stored field was not written + yet. This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + + ut_a(prebuilt->trx->isolation_level + == TRX_ISO_READ_UNCOMMITTED); + DBUG_RETURN(FALSE); + } + + ut_a(len != UNIV_SQL_NULL); + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, field_no, data, len); + + if (heap != prebuilt->blob_heap) { + mem_heap_free(heap); + } + } else { + /* The field is stored in the index record, or + in the metadata for instant ADD COLUMN. */ + data = rec_get_nth_cfield(rec, index, offsets, field_no, &len); + + if (len == UNIV_SQL_NULL) { + /* MySQL assumes that the field for an SQL + NULL value is set to the default value. 
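+ Besides setting the NULL bit below, the column bytes are
+ therefore copied from prebuilt->default_rec, so the buffer never
+ contains stale data.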
*/ + ut_ad(templ->mysql_null_bit_mask); + + MEM_CHECK_DEFINED(prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + memcpy(mysql_rec + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + DBUG_RETURN(TRUE); + } + + if (DATA_LARGE_MTYPE(templ->type) + || DATA_GEOMETRY_MTYPE(templ->type)) { + + /* It is a BLOB field locally stored in the + InnoDB record: we MUST copy its contents to + prebuilt->blob_heap here because + row_sel_field_store_in_mysql_format() stores a + pointer to the data, and the data passed to us + will be invalid as soon as the + mini-transaction is committed and the page + latch on the clustered index page is + released. */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + srv_page_size); + DBUG_PRINT("anna", ("blob_heap allocated: %p", + prebuilt->blob_heap)); + } + + data = static_cast<byte*>( + mem_heap_dup(prebuilt->blob_heap, data, len)); + } + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, field_no, data, len); + } + + ut_ad(len != UNIV_SQL_NULL); + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] + &= static_cast<byte>(~templ->mysql_null_bit_mask); + } + + DBUG_RETURN(TRUE); +} + +/** Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. +@param[out] mysql_rec row in the MySQL format +@param[in] prebuilt cursor +@param[in] rec Innobase record in the index + which was described in prebuilt's + template, or in the clustered index; + must be protected by a page latch +@param[in] vrow virtual columns +@param[in] rec_clust whether index must be the clustered index +@param[in] index index of rec +@param[in] offsets array returned by rec_get_offsets(rec) +@retval true on success +@retval false if not all columns could be retrieved */ +MY_ATTRIBUTE((warn_unused_result)) +static bool row_sel_store_mysql_rec( + byte* mysql_rec, + row_prebuilt_t* prebuilt, + const rec_t* rec, + const dtuple_t* vrow, + bool rec_clust, + const dict_index_t* index, + const rec_offs* offsets) +{ + DBUG_ENTER("row_sel_store_mysql_rec"); + + ut_ad(rec_clust || index == prebuilt->index); + ut_ad(!rec_clust || dict_index_is_clust(index)); + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + + for (ulint i = 0; i < prebuilt->n_template; i++) { + const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; + + if (templ->is_virtual && dict_index_is_clust(index)) { + /* Skip virtual columns if it is not a covered + search or virtual key read is not requested. */ + if (!rec_clust + || !prebuilt->index->has_virtual() + || !prebuilt->read_just_key) { + /* Initialize the NULL bit. 
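+ A skipped virtual column should read as SQL NULL rather
+ than as leftover bytes from a previous row.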
*/ + if (templ->mysql_null_bit_mask) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + } + continue; + } + + dict_v_col_t* col; + col = dict_table_get_nth_v_col( + index->table, templ->clust_rec_field_no); + + ut_ad(vrow); + + const dfield_t* dfield = dtuple_get_nth_v_field( + vrow, col->v_pos); + + if (dfield_get_type(dfield)->mtype == DATA_MISSING) { + ut_ad("no ha_innopart in MariaDB" == 0); + continue; + } + + if (dfield->len == UNIV_SQL_NULL) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + memcpy(mysql_rec + + templ->mysql_col_offset, + (const byte*) prebuilt->default_rec + + templ->mysql_col_offset, + templ->mysql_col_len); + } else { + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, index, templ->clust_rec_field_no, + (const byte*)dfield->data, dfield->len); + if (templ->mysql_null_bit_mask) { + mysql_rec[ + templ->mysql_null_byte_offset] + &= static_cast<byte> + (~templ->mysql_null_bit_mask); + } + } + + continue; + } + + const ulint field_no + = rec_clust + ? templ->clust_rec_field_no + : templ->rec_field_no; + /* We should never deliver column prefixes to the SQL layer, + except for evaluating handler_index_cond_check() + or handler_rowid_filter_check(). */ + /* ...actually, we do want to do this in order to + support the prefix query optimization. + + ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len + == 0); + + ...so we disable this assert. */ + + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, + rec, index, offsets, + field_no, templ)) { + + DBUG_RETURN(false); + } + } + + /* FIXME: We only need to read the doc_id if an FTS indexed + column is being updated. + NOTE, the record can be cluster or secondary index record. + if secondary index is used then FTS_DOC_ID column should be part + of this index. 
*/ + if (dict_table_has_fts_index(prebuilt->table)) { + if (dict_index_is_clust(index) + || prebuilt->fts_doc_id_in_read_set) { + prebuilt->fts_doc_id = fts_get_doc_id_from_rec( + rec, index, offsets); + } + } + + DBUG_RETURN(true); +} + +static void row_sel_reset_old_vers_heap(row_prebuilt_t *prebuilt) +{ + if (prebuilt->old_vers_heap) + mem_heap_empty(prebuilt->old_vers_heap); + else + prebuilt->old_vers_heap= mem_heap_create(200); +} + +/*********************************************************************//** +Builds a previous version of a clustered index record for a consistent read +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */ + dict_index_t* clust_index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in a clustered index */ + rec_offs** offsets, /*!< in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /*!< in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /*!< out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + dtuple_t** vrow, /*!< out: dtuple to hold old virtual + column data */ + mtr_t* mtr) /*!< in: mtr */ +{ + row_sel_reset_old_vers_heap(prebuilt); + + return row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, + &prebuilt->trx->read_view, offset_heap, + prebuilt->old_vers_heap, old_vers, vrow); +} + +/** Helper class to cache clust_rec and old_vers */ +class Row_sel_get_clust_rec_for_mysql +{ + const rec_t *cached_clust_rec; + rec_t *cached_old_vers; + lsn_t cached_lsn; + page_id_t cached_page_id; + +#ifdef UNIV_DEBUG + void check_eq(const dict_index_t *index, const rec_offs *offsets) const + { + rec_offs vers_offs[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS]; + rec_offs_init(vers_offs); + mem_heap_t *heap= nullptr; + + ut_ad(rec_offs_validate(cached_clust_rec, index, offsets)); + ut_ad(index->first_user_field() <= rec_offs_n_fields(offsets)); + ut_ad(vers_offs == rec_get_offsets(cached_old_vers, index, vers_offs, + index->n_core_fields, + index->db_trx_id(), &heap)); + ut_ad(!heap); + for (auto n= index->db_trx_id(); n--; ) + { + const dict_col_t *col= dict_index_get_nth_col(index, n); + ulint len1, len2; + const byte *b1= rec_get_nth_field(cached_clust_rec, offsets, n, &len1); + const byte *b2= rec_get_nth_field(cached_old_vers, vers_offs, n, &len2); + ut_ad(!cmp_data(col->mtype, col->prtype, false, b1, len1, b2, len2)); + } + } +#endif + +public: + Row_sel_get_clust_rec_for_mysql() : + cached_clust_rec(NULL), cached_old_vers(NULL), cached_lsn(0), + cached_page_id(page_id_t(0,0)) {} + + dberr_t operator()(row_prebuilt_t *prebuilt, dict_index_t *sec_index, + const rec_t *rec, que_thr_t *thr, const rec_t **out_rec, + rec_offs **offsets, mem_heap_t **offset_heap, + dtuple_t **vrow, mtr_t *mtr); +}; + +/*********************************************************************//** +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. 
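+The functor caches the most recently built old version: cached_old_vers is
+keyed on the page LSN, the page id and the clustered record pointer, so a
+repeated visit to the same clustered record within one consistent read does
+not have to rebuild the old version from the undo log again.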
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ +dberr_t +Row_sel_get_clust_rec_for_mysql::operator()( +/*============================*/ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ + dict_index_t* sec_index,/*!< in: secondary index where rec resides */ + const rec_t* rec, /*!< in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /*!< in: query thread */ + const rec_t** out_rec,/*!< out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + rec_offs** offsets,/*!< in: offsets returned by + rec_get_offsets(rec, sec_index); + out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + dtuple_t** vrow, /*!< out: virtual column to fill */ + mtr_t* mtr) /*!< in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + rec_t* old_vers; + trx_t* trx; + + prebuilt->clust_pcur->old_rec = nullptr; + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, + sec_index, *offsets); + + clust_index = dict_table_get_first_index(sec_index->table); + prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index; + + dberr_t err = btr_pcur_open_with_no_init(prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + + const rec_t* clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + prebuilt->clust_pcur->trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(prebuilt->pcur); + + /* If this is a spatial index scan, and we are reading + from a shadow buffer, the record could be already + deleted (due to rollback etc.). So get the original + page and verify that */ + if (dict_index_is_spatial(sec_index) + && btr_cur->rtr_info->matches + && (page_align(rec) + == btr_cur->rtr_info->matches->block.page.frame + || rec != btr_pcur_get_rec(prebuilt->pcur))) { +#ifdef UNIV_DEBUG + rtr_info_t* rtr_info = btr_cur->rtr_info; + mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex); + /* The page could be deallocated (by rollback etc.) */ + if (!rtr_info->matches->valid) { + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + clust_rec = NULL; + goto func_exit; + } + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + if (rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + && prebuilt->select_lock_type == LOCK_NONE) { + + clust_rec = NULL; + goto func_exit; + } + + if (rec != btr_pcur_get_rec(prebuilt->pcur)) { + clust_rec = NULL; + goto func_exit; + } + + /* FIXME: Why is this block not the + same as btr_pcur_get_block(prebuilt->pcur), + and is it not unsafe to use RW_NO_LATCH here? 
*/ + buf_block_t* block = buf_page_get_gen( + btr_pcur_get_block(prebuilt->pcur)->page.id(), + btr_pcur_get_block(prebuilt->pcur)->zip_size(), + RW_NO_LATCH, NULL, BUF_GET, mtr, &err); + ut_ad(block); // FIXME: avoid crash + mem_heap_t* heap = mem_heap_create(256); + dtuple_t* tuple = dict_index_build_data_tuple( + rec, sec_index, true, + sec_index->n_fields, heap); + page_cur_t page_cursor; + page_cursor.block = block; + page_cursor.index = sec_index; + ulint up_match = 0, low_match = 0; + ut_ad(!page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, + &low_match, + &page_cursor, + nullptr)); + ut_ad(low_match < dtuple_get_n_fields_cmp(tuple)); + mem_heap_free(heap); + err = DB_SUCCESS; +#endif /* UNIV_DEBUG */ + } else if (!rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + || prebuilt->select_lock_type != LOCK_NONE) { + /* In a rare case it is possible that no clust + rec is found for a delete-marked secondary index + record: if row_undo_mod_clust() has already removed + the clust rec, while purge is still cleaning and + removing secondary index records associated with + earlier versions of the clustered index record. + In that case we know that the clustered index + record did not exist in the read view of trx. */ + ib::error() << "Clustered record for sec rec not found" + " index " << sec_index->name + << " of table " << sec_index->table->name; + + fputs("InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + err = DB_CORRUPTION; + } + + clust_rec = NULL; + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(prebuilt->clust_pcur), + clust_rec, clust_index, *offsets, + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, + thr); + + switch (err) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + break; + default: + return err; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || clust_index->table->is_temporary()) { + } else { + /* If the isolation level allows reading of + uncommitted data, then we never look for an + earlier version */ + err = row_sel_clust_sees(clust_rec, *clust_index, + *offsets, trx->read_view); + } + + switch (err) { + default: + return err; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + const buf_page_t& bpage = btr_pcur_get_block( + prebuilt->clust_pcur)->page; + + const lsn_t lsn = mach_read_from_8( + page_align(clust_rec) + FIL_PAGE_LSN); + + if (lsn != cached_lsn + || bpage.id() != cached_page_id + || clust_rec != cached_clust_rec) { + /* The following call returns 'offsets' associated with + 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + prebuilt, clust_index, + clust_rec, offsets, offset_heap, &old_vers, + vrow, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + cached_lsn = lsn; + cached_page_id = bpage.id(); + cached_clust_rec = clust_rec; + cached_old_vers = old_vers; + } else { + err = DB_SUCCESS; + old_vers = cached_old_vers; + + /* The offsets need not be same for 
the latest
+ version of clust_rec and its old version
+ old_vers. Re-calculate the offsets for old_vers. */
+
+ if (old_vers) {
+ ut_d(check_eq(clust_index, *offsets));
+ *offsets = rec_get_offsets(
+ old_vers, clust_index, *offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, offset_heap);
+ }
+ }
+
+ if (old_vers == NULL) {
+ return err;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of the row, or the
+ secondary index record is delete-marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such a row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query that wants to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we did not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ /* And for a spatial index, since rec is from the shadow buffer,
+ we need to check whether it exactly matches clust_rec. */
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || dict_index_is_spatial(sec_index)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))) {
+ err = row_sel_sec_rec_is_for_clust_rec(rec, sec_index,
+ clust_rec, clust_index, thr);
+ switch (err) {
+ case DB_SUCCESS:
+ clust_rec = NULL;
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ return err;
+ }
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ return err;
+}
+
+/** Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@param[out] same_user_rec true if we were able to restore the cursor on a user
+record with the same ordering prefix in the B-tree index
+@param[in] latch_mode latch mode wished in restoration
+@param[in] pcur cursor whose position has been stored
+@param[in] moves_up true if the cursor moves up in the index
+@param[in,out] mtr mtr; CAUTION: may commit mtr temporarily!
+@return true if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static bool sel_restore_position_for_mysql(bool *same_user_rec,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *pcur,
+ bool moves_up, mtr_t *mtr)
+{
+ auto status = pcur->restore_position(latch_mode, mtr);
+
+ *same_user_rec = status == btr_pcur_t::SAME_ALL;
+
+ ut_ad(!*same_user_rec || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+ ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+ || pcur->rel_pos == BTR_PCUR_AFTER);
+ } else {
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+ == btr_pcur_is_on_user_rec(pcur));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* The position may need to be adjusted for rel_pos and moves_up.
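+ In short (a summary of the switch below): if the cursor was restored on
+ the same user record, it may or may not have to be processed again
+ depending on rel_pos; if the record was purged in the meantime, the
+ cursor may have to be moved one step forward (when scanning up) or
+ backward (when scanning down), skipping any metadata pseudo-record.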
*/ + + switch (pcur->rel_pos) { + case BTR_PCUR_ON: + if (!*same_user_rec && moves_up) { + if (status == btr_pcur_t::SAME_UNIQ) + return true; +next: + if (btr_pcur_move_to_next(pcur, mtr) + && rec_is_metadata(btr_pcur_get_rec(pcur), + *pcur->index())) { + btr_pcur_move_to_next(pcur, mtr); + } + + return true; + } + return(!*same_user_rec); + case BTR_PCUR_AFTER_LAST_IN_TREE: + case BTR_PCUR_BEFORE_FIRST_IN_TREE: + return true; + case BTR_PCUR_AFTER: + /* positioned to record after pcur->old_rec. */ + pcur->pos_state = BTR_PCUR_IS_POSITIONED; +prev: + if (btr_pcur_is_on_user_rec(pcur) && !moves_up + && !rec_is_metadata(btr_pcur_get_rec(pcur), + *pcur->index())) { + if (!btr_pcur_move_to_prev(pcur, mtr)) { + return true; + } + } + return true; + case BTR_PCUR_BEFORE: + /* For non optimistic restoration: + The position is now set to the record before pcur->old_rec. + + For optimistic restoration: + The position also needs to take the previous search_mode into + consideration. */ + + switch (pcur->pos_state) { + case BTR_PCUR_IS_POSITIONED_OPTIMISTIC: + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + if (pcur->search_mode == PAGE_CUR_GE) { + /* Positioned during Greater or Equal search + with BTR_PCUR_BEFORE. Optimistic restore to + the same record. If scanning for lower then + we must move to previous record. + This can happen with: + HANDLER READ idx a = (const); + HANDLER READ idx PREV; */ + goto prev; + } + return true; + case BTR_PCUR_IS_POSITIONED: + if (moves_up && btr_pcur_is_on_user_rec(pcur)) { + goto next; + } + return true; + case BTR_PCUR_WAS_POSITIONED: + case BTR_PCUR_NOT_POSITIONED: + break; + } + } + ut_ad(0); + return true; +} + +/********************************************************************//** +Copies a cached field for MySQL from the fetch cache. */ +static +void +row_sel_copy_cached_field_for_mysql( +/*================================*/ + byte* buf, /*!< in/out: row buffer */ + const byte* cache, /*!< in: cached row */ + const mysql_row_templ_t*templ) /*!< in: column template */ +{ + ulint len; + + buf += templ->mysql_col_offset; + cache += templ->mysql_col_offset; + + MEM_CHECK_ADDRESSABLE(buf, templ->mysql_col_len); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR + && (templ->type != DATA_INT)) { + /* Check for != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! + Find the actual length of the true VARCHAR field. */ + row_mysql_read_true_varchar( + &len, cache, templ->mysql_length_bytes); + len += templ->mysql_length_bytes; + MEM_UNDEFINED(buf, templ->mysql_col_len); + } else { + len = templ->mysql_col_len; + } + + memcpy(buf, cache, len); +} + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. 
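+The NULL bit of each copied column is merged from cached_rec into buf below
+with the bit-select idiom buf ^= (buf ^ cache) & mask, which is equivalent to
+buf = (buf & ~mask) | (cache & mask) and leaves the other NULL bits stored in
+the same byte untouched.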
*/ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt) +{ + const mysql_row_templ_t*templ; + ulint i; + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + + /* Skip virtual columns */ + if (templ->is_virtual) { + continue; + } + + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) { +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + buf[templ->mysql_null_byte_offset] + ^= (buf[templ->mysql_null_byte_offset] + ^ cached_rec[templ->mysql_null_byte_offset]) + & (byte) templ->mysql_null_bit_mask; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + } + } +} + +/********************************************************************//** +Pops a cached row for MySQL from the fetch cache. */ +UNIV_INLINE +void +row_sel_dequeue_cached_row_for_mysql( +/*=================================*/ + byte* buf, /*!< in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */ +{ + ulint i; + const mysql_row_templ_t*templ; + const byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + MEM_CHECK_ADDRESSABLE(buf, prebuilt->mysql_row_len); + + cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { + row_sel_copy_cached_fields_for_mysql(buf, cached_rec, prebuilt); + } else if (prebuilt->mysql_prefix_len > 63) { + /* The record is long. Copy it field by field, in case + there are some long VARCHAR column of which only a + small length is being used. */ + MEM_UNDEFINED(buf, prebuilt->mysql_prefix_len); + + /* First copy the NULL bits. */ + memcpy(buf, cached_rec, prebuilt->null_bitmap_len); + /* Then copy the requested fields. */ + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + + /* Skip virtual columns */ + if (templ->is_virtual + && !(dict_index_has_virtual(prebuilt->index) + && prebuilt->read_just_key)) { + continue; + } + + row_sel_copy_cached_field_for_mysql( + buf, cached_rec, templ); + } + } else { + memcpy(buf, cached_rec, prebuilt->mysql_prefix_len); + } + + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/********************************************************************//** +Initialise the prefetch cache. */ +UNIV_INLINE +void +row_sel_prefetch_cache_init( +/*========================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ulint i; + ulint sz; + byte* ptr; + + /* Reserve space for the magic number. */ + sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8); + ptr = static_cast<byte*>(ut_malloc_nokey(sz)); + + for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. 
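+ Each cache slot thus occupies mysql_row_len + 8 bytes of the single
+ allocation made above: a 4-byte ROW_PREBUILT_FETCH_MAGIC_N marker, the
+ row buffer itself, and a trailing 4-byte marker.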
*/ + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + + prebuilt->fetch_cache[i] = ptr; + ptr += prebuilt->mysql_row_len; + + mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N); + ptr += 4; + } +} + +/********************************************************************//** +Get the last fetch cache buffer from the queue. +@return pointer to buffer. */ +UNIV_INLINE +byte* +row_sel_fetch_last_buf( +/*===================*/ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + ut_ad(!prebuilt->templ_contains_blob); + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + ut_ad(prebuilt->n_fetch_cached == 0); + + row_sel_prefetch_cache_init(prebuilt); + } + + ut_ad(prebuilt->fetch_cache_first == 0); + MEM_UNDEFINED(prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt->mysql_row_len); + + return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]); +} + +/********************************************************************//** +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_enqueue_cache_row_for_mysql( +/*================================*/ + byte* mysql_rec, /*!< in/out: MySQL record */ + row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */ +{ + /* For non ICP code path the row should already exist in the + next fetch cache slot. */ + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + memcpy(row_sel_fetch_last_buf(prebuilt), mysql_rec, + prebuilt->mysql_row_len); + } + + ++prebuilt->n_fetch_cached; +} + +#ifdef BTR_CUR_HASH_ADAPT +/*********************************************************************//** +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode if AHI is enabled. 
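+A return value of SEL_RETRY means that the shortcut could not be used and the
+caller falls back to the normal search path; SEL_EXHAUSTED means that the
+result set is known to be empty for this search tuple.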
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + const rec_t** out_rec,/*!< out: record if found */ + row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */ + rec_offs** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /*!< in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + const rec_t* rec; + + ut_ad(index->is_primary()); + ut_ad(!index->table->is_temporary()); + ut_ad(!prebuilt->templ_contains_blob); + ut_ad(trx->read_view.is_open()); + pcur->old_rec = nullptr; + + if (btr_pcur_open_with_no_init(search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, mtr) + != DB_SUCCESS) { + return SEL_RETRY; + } + + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) { + return SEL_RETRY; + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + return SEL_EXHAUSTED; + } + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* See row_search_mvcc() for a comment on bulk_trx_id */ + if (!trx->read_view.changes_visible(bulk_trx_id)) { + return SEL_EXHAUSTED; + } + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); + + if (row_sel_clust_sees(rec, *index, *offsets, trx->read_view) + != DB_SUCCESS) { + return SEL_RETRY; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + return SEL_EXHAUSTED; + } + + *out_rec = rec; + + return SEL_FOUND; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/*********************************************************************//** +Check a pushed-down index condition. +@return CHECK_ABORTED_BY_USER, CHECK_NEG, CHECK_POS, or CHECK_OUT_OF_RANGE */ +static +check_result_t +row_search_idx_cond_check( +/*======================*/ + byte* mysql_rec, /*!< out: record + in MySQL format (invalid unless + prebuilt->idx_cond!=NULL and + we return ICP_MATCH) */ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + for the table handle */ + const rec_t* rec, /*!< in: InnoDB record */ + const rec_offs* offsets) /*!< in: rec_get_offsets() */ +{ + ulint i; + + ut_ad(rec_offs_validate(rec, prebuilt->index, offsets)); + + if (!prebuilt->idx_cond) { + if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) { + return(CHECK_POS); + } + } else { + MONITOR_INC(MONITOR_ICP_ATTEMPTS); + } + + /* Convert to MySQL format those fields that are needed for + evaluating the index condition. 
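+ Only the first idx_cond_n_cols entries of mysql_template are converted
+ here; ha_innobase::build_template() is expected to have placed the
+ columns referenced by the pushed-down condition at the start of the
+ template array.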
*/ + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + mem_heap_empty(prebuilt->blob_heap); + } + + for (i = 0; i < prebuilt->idx_cond_n_cols; i++) { + const mysql_row_templ_t*templ = &prebuilt->mysql_template[i]; + + /* Skip virtual columns */ + if (templ->is_virtual) { + continue; + } + + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, + rec, prebuilt->index, offsets, + templ->icp_rec_field_no, + templ)) { + return(CHECK_NEG); + } + } + + /* We assume that the index conditions on + case-insensitive columns are case-insensitive. The + case of such columns may be wrong in a secondary + index, if the case of the column has been updated in + the past, or a record has been deleted and a record + inserted in a different case. */ + check_result_t result = prebuilt->idx_cond + ? handler_index_cond_check(prebuilt->idx_cond) + : CHECK_POS; + + switch (result) { + case CHECK_POS: + if (handler_rowid_filter_is_active(prebuilt->pk_filter)) { + ut_ad(!prebuilt->index->is_primary()); + if (prebuilt->clust_index_was_generated) { + ulint len; + dict_index_t* index = prebuilt->index; + const byte* data = rec_get_nth_field( + rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); + } + result = handler_rowid_filter_check(prebuilt->pk_filter); + switch (result) { + case CHECK_NEG: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case CHECK_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + case CHECK_POS: + break; + default: + return(result); + } + } + /* Convert the remaining fields to MySQL format. + If this is a secondary index record, we must defer + this until we have fetched the clustered index record. 
*/ + if (!prebuilt->need_to_access_clustered + || dict_index_is_clust(prebuilt->index)) { + if (!row_sel_store_mysql_rec( + mysql_rec, prebuilt, rec, NULL, false, + prebuilt->index, offsets)) { + ut_ad(dict_index_is_clust(prebuilt->index)); + return(CHECK_NEG); + } + } + MONITOR_INC(MONITOR_ICP_MATCH); + return(result); + case CHECK_NEG: + MONITOR_INC(MONITOR_ICP_NO_MATCH); + return(result); + case CHECK_OUT_OF_RANGE: + MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE); + return(result); + case CHECK_ERROR: + case CHECK_ABORTED_BY_USER: + return(result); + } + + ut_error; + return(result); +} + +/** Extract virtual column data from a virtual index record and fill a dtuple +@param[in] rec the virtual (secondary) index record +@param[in] index the virtual index +@param[in,out] vrow the dtuple where data extract to +@param[in] heap memory heap to allocate memory +*/ +static +void +row_sel_fill_vrow( + const rec_t* rec, + dict_index_t* index, + dtuple_t** vrow, + mem_heap_t* heap) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!(*vrow)); + ut_ad(heap); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!index->is_instant()); + ut_ad(page_rec_is_leaf(rec)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + *vrow = dtuple_create_with_vcol( + heap, 0, dict_table_get_n_v_cols(index->table)); + + /* Initialize all virtual row's mtype to DATA_MISSING */ + dtuple_init_v_fld(*vrow); + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + + if (col->is_virtual()) { + const byte* data; + ulint len; + + data = rec_get_nth_field(rec, offsets, i, &len); + + const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t*>(col); + + dfield_t* dfield = dtuple_get_nth_v_field( + *vrow, vcol->v_pos); + dfield_set_data(dfield, data, len); + dict_col_copy_type(col, dfield_get_type(dfield)); + } + } +} + +/** Return the record field length in characters. +@param[in] col table column of the field +@param[in] field_no field number +@param[in] rec physical record +@param[in] offsets field offsets in the physical record +@return field length in characters. */ +static +size_t +rec_field_len_in_chars( + const dict_col_t* col, + const ulint field_no, + const rec_t* rec, + const rec_offs* offsets) +{ + const ulint cset = dtype_get_charset_coll(col->prtype); + const CHARSET_INFO* cs = all_charsets[cset]; + ulint rec_field_len; + const char* rec_field = reinterpret_cast<const char *>( + rec_get_nth_field( + rec, offsets, field_no, &rec_field_len)); + + if (UNIV_UNLIKELY(!cs)) { + ib::warn() << "Missing collation " << cset; + return SIZE_T_MAX; + } + + return cs->numchars(rec_field, rec_field + rec_field_len); +} + +/** Avoid the clustered index lookup if all the following conditions +are true: +1) all columns are in secondary index +2) all values for columns that are prefix-only indexes are shorter +than the prefix size. This optimization can avoid many IOs for certain schemas. +@return true, to avoid clustered index lookup. 
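+For example (an illustrative case): with a secondary index on col(10) and a
+query that only reads col, a stored value of 7 characters fits entirely
+within the 10-character prefix, so the secondary index record already
+contains the full value and the clustered index need not be visited.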
*/
+static
+bool row_search_with_covering_prefix(
+ row_prebuilt_t* prebuilt,
+ const rec_t* rec,
+ const rec_offs* offsets)
+{
+ const dict_index_t* index = prebuilt->index;
+ ut_ad(!dict_index_is_clust(index));
+
+ /* In ha_innobase::build_template() we choose to access the
+ whole row when using exclusive row locks or in the case of an
+ FTS query, because then we need to read from the clustered index */
+ if (prebuilt->select_lock_type == LOCK_X || prebuilt->in_fts_query
+ || !index->is_btree()) {
+ return false;
+ }
+
+ /** The optimization is only applicable if the number of template
+ fields does not exceed the number of fields in the secondary index. */
+ if (prebuilt->n_template > index->n_fields) {
+ return false;
+ }
+
+ /* We can avoid a clustered index lookup if
+ all of the following hold:
+ (1) all columns are in the secondary index
+ (2) all values for columns that are prefix-only
+ indexes are shorter than the prefix size
+ This optimization can avoid many IOs for certain schemas. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ ulint j = templ->rec_prefix_field_no;
+ ut_ad(!templ->mbminlen == !templ->mbmaxlen);
+
+ /** Condition (1): is the field in the index? */
+ if (j == ULINT_UNDEFINED) {
+ return false;
+ }
+
+ /** Condition (2): if this is a prefix index, then the
+ row's value must be shorter than the prefix length. */
+
+ if (!templ->rec_field_is_prefix
+ || rec_offs_nth_sql_null(offsets, j)) {
+ continue;
+ }
+
+ const dict_field_t* field = dict_index_get_nth_field(index, j);
+
+ if (!field->prefix_len) {
+ continue;
+ }
+
+ const ulint rec_size = rec_offs_nth_size(offsets, j);
+
+ if (rec_size >= field->prefix_len) {
+ /* The byte length of the stored value is not
+ shorter than the maximum possible index prefix,
+ so the value may have been truncated. */
+ return false;
+ }
+
+ if (templ->mbminlen != templ->mbmaxlen
+ && rec_field_len_in_chars(field->col, j, rec, offsets)
+ >= field->prefix_len / templ->mbmaxlen) {
+ /* The number of characters in the stored value
+ may reach the index prefix length in characters,
+ so the value may have been truncated. */
+ return false;
+ }
+ }
+
+ /* The prefix index optimization conditions are satisfied: for
+ all columns above, use rec_prefix_field_no instead of
+ rec_field_no, and skip the clustered index lookup below. */
+ for (ulint i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ = prebuilt->mysql_template + i;
+ templ->rec_field_no = templ->rec_prefix_field_no;
+ ut_a(templ->rec_field_no != ULINT_UNDEFINED);
+ }
+
+ return true;
+}
+
+/** Searches for rows in the database using a cursor.
+The function is mainly used for tables that are shared across connections,
+and so it employs techniques that can help reconstruct the rows that the
+transaction is supposed to see.
+It also has optimizations such as prefetch caching of rows and use of the
+adaptive hash index.
+
+@param[out] buf buffer for the fetched row in MySQL format
+@param[in] mode search mode PAGE_CUR_L
+@param[in,out] prebuilt prebuilt struct for the table handler;
+ this contains the info for search_tuple and
+ index; if the search tuple contains 0 fields then
+ we position the cursor at the start or the end of
+ the index, depending on 'mode'
+@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
+ Note: if this is != 0, then prebuilt must have a
+ pcur with a stored position! When opening a
+ cursor, 'direction' should be 0.
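+
+The implementation proceeds in the phases labelled in the function body:
+(1) try to pop a row from the prefetch cache, (2) optionally attempt a
+shortcut through the adaptive hash index for unique clustered-index point
+queries, (3) open or restore the index cursor position, and (4) loop over
+the matching records, applying locking or MVCC visibility rules and any
+pushed-down index condition.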
+@return DB_SUCCESS or error code */ +dberr_t +row_search_mvcc( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) +{ + DBUG_ENTER("row_search_mvcc"); + DBUG_ASSERT(prebuilt->index->table == prebuilt->table); + + dict_index_t* index = prebuilt->index; + ibool comp = dict_table_is_comp(prebuilt->table); + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + const rec_t* UNINIT_VAR(rec); + dtuple_t* vrow = NULL; + const rec_t* result_rec = NULL; + const rec_t* clust_rec; + Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql; + ibool unique_search = FALSE; + ulint mtr_extra_clust_savepoint = 0; + bool moves_up = false; + /* if the returned record was locked and we did a semi-consistent + read (fetch the newest committed version), then this is set to + TRUE */ + ulint next_offs; + bool same_user_rec; + ibool table_lock_waited = FALSE; + byte* next_buf = 0; + bool spatial_search = false; + + ut_ad(index && pcur && search_tuple); + ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); + + /* We don't support FTS queries from the HANDLER interfaces, because + we implemented FTS as reversed inverted index with auxiliary tables. + So anything related to traditional index query would not apply to + it. */ + if (prebuilt->index->type & DICT_FTS) { + DBUG_RETURN(DB_END_OF_INDEX); + } + + if (!prebuilt->table->space) { + DBUG_RETURN(DB_TABLESPACE_DELETED); + } else if (!prebuilt->table->is_readable()) { + if (fil_space_crypt_t* crypt_data = + prebuilt->table->space->crypt_data) { + if (crypt_data->should_encrypt()) { + DBUG_RETURN(DB_DECRYPTION_FAILED); + } + } + DBUG_RETURN(DB_CORRUPTION); + } else if (!prebuilt->index_usable) { + DBUG_RETURN(DB_MISSING_HISTORY); + } else if (prebuilt->index->is_corrupted()) { + DBUG_RETURN(DB_CORRUPTION); + } + + pcur->btr_cur.page_cur.index = index; + + /* We need to get the virtual column values stored in secondary + index key, if this is covered index scan or virtual key read is + requested. */ + bool need_vrow = prebuilt->read_just_key + && prebuilt->index->has_virtual(); + + /* Reset the new record lock info if READ UNCOMMITTED or + READ COMMITED isolation level is used. Then + we are able to remove the record locks set here on an individual + row. */ + prebuilt->new_rec_locks = 0; + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (UNIV_UNLIKELY(direction == 0)) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! 
*/ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + trx->op_info = ""; + DBUG_RETURN(DB_SUCCESS); + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { +early_not_found: + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + trx->op_info = ""; + DBUG_RETURN(DB_RECORD_NOT_FOUND); + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a + non-delete-marked matching record. + + Note that in a unique secondary index there may be different + delete-marked versions of a record where only the primary key + values differ: thus in a secondary index we must use next-key + locks when locking delete-marked records. */ + + if (match_mode == ROW_SEL_EXACT + && dict_index_is_unique(index) + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (dict_index_is_clust(index) + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (UNIV_UNLIKELY(direction != 0 + && !prebuilt->used_in_HANDLER)) { + goto early_not_found; + } + } + + /* We don't support sequencial scan for Rtree index, because it + is no meaning to do so. */ + if (dict_index_is_spatial(index) && !RTREE_SEARCH_MODE(mode)) { + trx->op_info = ""; + DBUG_RETURN(DB_END_OF_INDEX); + } + + /* if the query is a plain locking SELECT, and the isolation level + is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ + bool did_semi_consistent_read = false; + mtr_t mtr; + mtr.start(); + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + +#ifdef BTR_CUR_HASH_ADAPT + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. 
Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (UNIV_UNLIKELY(direction == 0) + && unique_search + && btr_search_enabled + && dict_index_is_clust(index) + && !index->table->is_temporary() + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < srv_page_size / 8)) { + + mode = PAGE_CUR_GE; + + if (prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view.is_open()) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. */ + + dberr_t err = DB_SUCCESS; + switch (row_sel_try_search_shortcut_for_mysql( + &rec, prebuilt, &offsets, &heap, + &mtr)) { + case SEL_FOUND: + /* At this point, rec is protected by + a page latch that was acquired by + row_sel_try_search_shortcut_for_mysql(). + The latch will not be released until + mtr.commit(). */ + ut_ad(!rec_get_deleted_flag(rec, comp)); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + switch (row_search_idx_cond_check( + buf, prebuilt, + rec, offsets)) { + case CHECK_ABORTED_BY_USER: + goto aborted; + case CHECK_NEG: + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto shortcut_done; + case CHECK_POS: + goto shortcut_done; + } + + ut_ad("incorrect code" == 0); +aborted: + err = DB_INTERRUPTED; + goto shortcut_done; + } + + if (!row_sel_store_mysql_rec( + buf, prebuilt, + rec, NULL, false, index, + offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such + records do not exist. Such + records may only be accessed + at the READ UNCOMMITTED + isolation level or when + rolling back a recovered + transaction. Rollback happens + at a lower level, not here. */ + + /* Proceed as in case SEL_RETRY. */ + break; + } + + goto shortcut_done; + + case SEL_EXHAUSTED: + err = DB_RECORD_NOT_FOUND; + shortcut_done: + mtr.commit(); + + /* NOTE that we do NOT store the cursor + position */ + trx->op_info = ""; + ut_ad(!did_semi_consistent_read); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + DBUG_RETURN(err); + + case SEL_RETRY: + break; + + default: + ut_ad(0); + } + + mtr.commit(); + mtr.start(); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + + spatial_search = dict_index_is_spatial(index) + && mode >= PAGE_CUR_CONTAIN; + +#ifdef UNIV_DEBUG + /* The state of a running trx can only be changed by the + thread that is currently serving the transaction. Because we + are that thread, we can read trx->state without holding any + mutex. 
*/ + switch (trx->state) { + case TRX_STATE_ACTIVE: + break; + case TRX_STATE_NOT_STARTED: + ut_ad(prebuilt->sql_stat_start + || prebuilt->table->no_rollback()); + break; + default: + ut_ad("invalid trx->state" == 0); + } +#endif + + ut_ad(prebuilt->sql_stat_start + || prebuilt->select_lock_type != LOCK_NONE + || trx->read_view.is_open() + || prebuilt->table->no_rollback() + || srv_read_only_mode); + + /* Do not lock gaps at READ UNCOMMITTED or READ COMMITTED + isolation level */ + const bool set_also_gap_locks = + prebuilt->select_lock_type != LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_COMMITTED +#ifdef WITH_WSREP + && !wsrep_thd_skip_locking(trx->mysql_thd) +#endif /* WITH_WSREP */ + ; + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (UNIV_UNLIKELY(direction == 0)) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G + || mode >= PAGE_CUR_CONTAIN) { + moves_up = true; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = true; + } + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + clust_index = dict_table_get_first_index(prebuilt->table); + + dberr_t err = DB_SUCCESS; + + /* Do some start-of-statement preparations */ + + if (prebuilt->table->no_rollback()) { + /* NO_ROLLBACK tables do not support MVCC or locking. */ + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->sql_stat_start = FALSE; + } else if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + ut_a(prebuilt->select_lock_type != LOCK_NONE + || srv_read_only_mode || trx->read_view.is_open()); + } else { + prebuilt->sql_stat_start = FALSE; + trx_start_if_not_started(trx, false); + + if (prebuilt->select_lock_type == LOCK_NONE) { + trx->read_view.open(trx); + } else { +wait_table_again: + err = lock_table(prebuilt->table, nullptr, + prebuilt->select_lock_type == LOCK_S + ? LOCK_IS : LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + table_lock_waited = TRUE; + goto lock_table_wait; + } + } + } + + /* Open or restore index cursor position */ + + if (UNIV_LIKELY(direction != 0)) { + if (spatial_search) { + /* R-Tree access does not need to do + cursor position and resposition */ + goto next_rec; + } + + bool need_to_process = sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, + pcur, moves_up, &mtr); + + if (UNIV_UNLIKELY(need_to_process)) { + if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) { + mtr.commit(); + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return DB_CORRUPTION; + } + + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + /* We did a semi-consistent read, + but the record was removed in + the meantime. */ + prebuilt->row_read_type + = ROW_READ_TRY_SEMI_CONSISTENT; + } + } else if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_DID_SEMI_CONSISTENT)) { + + /* The cursor was positioned on the record + that we returned previously. If we need + to repeat a semi-consistent read as a + pessimistic locking read, the record + cannot be skipped. 
*/ + + goto next_rec_after_check; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + pcur->btr_cur.thr = thr; + pcur->old_rec = nullptr; + + if (index->is_spatial()) { + if (!prebuilt->rtr_info) { + prebuilt->rtr_info = rtr_create_rtr_info( + set_also_gap_locks, true, + btr_pcur_get_btr_cur(pcur), index); + prebuilt->rtr_info->search_tuple = search_tuple; + prebuilt->rtr_info->search_mode = mode; + rtr_info_update_btr(btr_pcur_get_btr_cur(pcur), + prebuilt->rtr_info); + } else { + rtr_info_reinit_in_cursor( + btr_pcur_get_btr_cur(pcur), + index, set_also_gap_locks); + prebuilt->rtr_info->search_tuple = search_tuple; + prebuilt->rtr_info->search_mode = mode; + } + + err = rtr_search_leaf(pcur, search_tuple, mode, &mtr); + } else { + err = btr_pcur_open_with_no_init(search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, &mtr); + } + + if (err != DB_SUCCESS) { +page_corrupted: + rec = NULL; + goto page_read_error; + } + + pcur->trx_if_known = trx; + + rec = btr_pcur_get_rec(pcur); + ut_ad(page_rec_is_leaf(rec)); + + if (!moves_up + && set_also_gap_locks + && !page_rec_is_supremum(rec) + && !dict_index_is_spatial(index)) { + + /* Try to place a gap lock on the next index record + to prevent phantoms in ORDER BY ... DESC queries */ + const rec_t* next_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec)) { + err = DB_CORRUPTION; + goto page_corrupted; + } + + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(pcur, + next_rec, index, offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) { + err = pcur->open_leaf(mode == PAGE_CUR_G, index, + BTR_SEARCH_LEAF, &mtr); + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + btr_decryption_failed(*index); + } + rec = NULL; + goto page_read_error; + } + } + + /* Check if the table is supposed to be empty for our read view. + + If we read bulk_trx_id as an older transaction ID, it is not + incorrect to check here whether that transaction should be + visible to us. If bulk_trx_id is not visible to us, the table + must have been empty at an earlier point of time, also in our + read view. + + An INSERT would only update bulk_trx_id in + row_ins_clust_index_entry_low() if the table really was empty + (everything had been purged), when holding a leaf page latch + in the clustered index (actually, the root page is the only + leaf page in that case). + + We are already holding a leaf page latch here, either + in a secondary index or in a clustered index. + + If we are holding a clustered index page latch, there clearly + is no potential for race condition with a concurrent INSERT: + such INSERT would be blocked by us. + + If we are holding a secondary index page latch, then we are + not directly blocking a concurrent INSERT that might update + bulk_trx_id to something that does not exist in our read view. + But, in that case, the entire table (all indexes) must have + been empty. So, even if our read below missed the update of + index->table->bulk_trx_id, we can safely proceed to reading + the empty secondary index page. Our latch will prevent the + INSERT from proceeding to that page. It will first modify + the clustered index. 
Also, we may only look up something in + the clustered index if the secondary index page is not empty + to begin with. So, only if the table is corrupted + (the clustered index is empty but the secondary index is not) + we could return corrupted results. */ + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || !trx->read_view.is_open()) { + } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) { + /* InnoDB should allow the transaction to read all + the rows when InnoDB intends to do any locking + on the record */ + if (prebuilt->select_lock_type == LOCK_NONE + && !trx->read_view.changes_visible(bulk_trx_id)) { + trx->op_info = ""; + err = DB_END_OF_INDEX; + goto normal_return; + } + } + +rec_loop: + DEBUG_SYNC_C("row_search_rec_loop"); + if (trx_is_interrupted(trx)) { + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + err = DB_INTERRUPTED; + goto normal_return; + } + + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + + ut_ad(!!page_rec_is_comp(rec) == comp); + ut_ad(page_rec_is_leaf(rec)); + + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (page_rec_is_supremum(rec)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + + /* Try to place a lock on the index record */ + + /* If the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED, + we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. */ + + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(pcur, + rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + if (comp) { + if (rec_get_info_bits(rec, true) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + + goto wrong_offs; + } + } else { + if (rec_get_info_bits(rec, false) & REC_INFO_MIN_REC_FLAG) { + /* Skip the metadata pseudo-record. */ + ut_ad(index->is_instant()); + goto next_rec; + } + + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + + goto wrong_offs; + } + } + + if (UNIV_UNLIKELY(next_offs >= srv_page_size - PAGE_DIR)) { + +wrong_offs: + if (srv_force_recovery == 0 || moves_up == false) { + ib::error() << "Rec address " + << static_cast<const void*>(rec) + << ", buf block fix count " + << btr_pcur_get_block(pcur)->page + .buf_fix_count(); + + ib::error() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". Run CHECK TABLE. 
You may need to" + " restore from a backup, or dump + drop +" + " reimport the table."; + ut_ad(0); + err = DB_CORRUPTION; + + goto page_read_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + ib::info() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". We try to skip the rest of the page."; + + page_cur_set_after_last(btr_pcur_get_block(pcur), + btr_pcur_get_page_cur(pcur)); + pcur->old_rec = nullptr; + goto next_rec; + } + } + /*-------------------------------------------------------------*/ + + /* Calculate the 'offsets' associated with 'rec' */ + + ut_ad(fil_page_index_page_check(btr_pcur_get_page(pcur))); + ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + + ib::error() << "Index corruption: rec offs " + << page_offset(rec) << " next offs " + << next_offs + << btr_pcur_get_block(pcur)->page.id() + << ", index " << index->name + << " of table " << index->table->name + << ". We try to skip the record."; + + goto next_rec; + } + } + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! Thus + we have to recompare rec and search_tuple to determine if they + match enough. */ + + if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + /* fputs("Comparing rec and search tuple\n", stderr); */ + + if (cmp_dtuple_rec(search_tuple, rec, index, offsets)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + err = sel_set_rec_lock( + pcur, + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). */ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, + index, offsets)) { + + if (set_also_gap_locks + && !dict_index_is_spatial(index)) { + err = sel_set_rec_lock( + pcur, + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr, &mtr); + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + default: + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + /* The found record was not a match, but may be used + as NEXT record (index_next). Set the relative position + to BTR_PCUR_BEFORE, to reflect that the position of + the persistent cursor is before the found/stored row + (pcur->old_rec). 
*/ + ut_ad(pcur->rel_pos == BTR_PCUR_ON); + pcur->rel_pos = BTR_PCUR_BEFORE; + + err = DB_RECORD_NOT_FOUND; + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + unsigned lock_type; + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* At READ COMMITTED or READ UNCOMMITTED + isolation levels, do not lock committed + delete-marked records. */ + if (!rec_get_deleted_flag(rec, comp)) { + goto no_gap_lock; + } + + /* At most one transaction can be active + for temporary table. */ + if (clust_index->table->is_temporary()) { + goto no_gap_lock; + } + + if (index == clust_index) { + trx_id_t trx_id = row_get_rec_trx_id( + rec, index, offsets); + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(trx_id); + if (!trx_sys.is_registered(trx, trx_id)) { + /* The clustered index record + was delete-marked in a committed + transaction. Ignore the record. */ + goto locks_ok_del_marked; + } + } else if (trx_t* t = row_vers_impl_x_locked( + trx, rec, index, offsets)) { + /* The record belongs to an active + transaction. We must acquire a lock. */ + t->release_reference(); + } else { + /* The secondary index record does not + point to a delete-marked clustered index + record that belongs to an active transaction. + Ignore the secondary index record, because + it is not locked. */ + goto next_rec; + } + + goto no_gap_lock; + } + +#ifdef WITH_WSREP + if (UNIV_UNLIKELY(!set_also_gap_locks)) { + ut_ad(wsrep_thd_skip_locking(trx->mysql_thd)); + goto no_gap_lock; + } +#else /* WITH_WSREP */ + ut_ad(set_also_gap_locks); +#endif /* WITH_WSREP */ + + /* Set next-key lock both for delete- and non-delete-marked + records for unique search, because non-delete-marked record can + be marked as deleted while transaction suspends. */ + if (index->is_spatial()) { + goto no_gap_lock; + } + + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then no + need to lock the gap before that record. */ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && !cmp_dtuple_rec(search_tuple, rec, index, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(pcur, + rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr, &mtr); + + switch (err) { + const rec_t* old_vers; + case DB_SUCCESS_LOCKED_REC: + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* Note that a record of + prebuilt->index was locked. 
*/ + prebuilt->new_rec_locks = 1; + } + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + /* Lock wait for R-tree should already + be handled in sel_set_rtr_rec_lock() */ + ut_ad(!dict_index_is_spatial(index)); + /* Never unlock rows that were part of a conflict. */ + prebuilt->new_rec_locks = 0; + + if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_TRY_SEMI_CONSISTENT) + || unique_search + || index != clust_index) { + if (!prebuilt->skip_locked) { + goto lock_wait_or_error; + } + } else { + /* The following call returns 'offsets' + associated with 'old_vers' */ + row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, + need_vrow ? &vrow : NULL, &mtr); + } + + /* Check whether it was a deadlock or not, if not + a deadlock and the transaction had to wait then + release the lock it is waiting on. */ + + err = lock_trx_handle_wait(trx); + + switch (err) { + case DB_SUCCESS: + ut_ad( + !trx->lock.was_chosen_as_deadlock_victim); + /* The lock was granted while we were + searching for the last committed version. + Do a normal locking read. */ + + offsets = rec_get_offsets( + rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + goto locks_ok; + case DB_DEADLOCK: + goto lock_wait_or_error; + case DB_LOCK_WAIT: + ut_ad(!dict_index_is_spatial(index)); + err = DB_SUCCESS; + if (prebuilt->skip_locked) { + goto next_rec; + } + break; + case DB_LOCK_WAIT_TIMEOUT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + ut_error; + } + + if (old_vers == NULL) { + /* The row was not yet committed */ + + goto next_rec; + } + + did_semi_consistent_read = true; + rec = old_vers; + break; + case DB_RECORD_NOT_FOUND: + if (dict_index_is_spatial(index)) { + goto next_rec; + } else { + goto lock_wait_or_error; + } + break; + case DB_LOCK_WAIT_TIMEOUT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || prebuilt->table->is_temporary() + || prebuilt->table->no_rollback()) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + err = row_sel_clust_sees(rec, *index, offsets, + trx->read_view); + + switch (err) { + default: + goto lock_wait_or_error; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + ut_ad(srv_force_recovery + < SRV_FORCE_NO_UNDO_LOG_SCAN); + rec_t* old_vers; + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + prebuilt, clust_index, + rec, &offsets, &heap, &old_vers, + need_vrow ? &vrow : nullptr, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. 
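+	As a shortcut, if PAGE_MAX_TRX_ID of the secondary index leaf page is visible in the read view, then all changes on this page are visible as well, and the clustered index lookup can be skipped.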
*/ + + ut_ad(!dict_index_is_clust(index)); + + if (!srv_read_only_mode) { + trx_id_t trx_id = page_get_max_trx_id( + page_align(rec)); + ut_ad(trx_id); + if (trx->read_view.sees(trx_id)) { + goto locks_ok; + } + /* We should look at the clustered index. + However, as this is a non-locking read, + we can skip the clustered index lookup if + the condition does not match the secondary + index entry. */ + switch (row_search_idx_cond_check( + buf, prebuilt, rec, offsets)) { + case CHECK_NEG: + goto next_rec; + case CHECK_ABORTED_BY_USER: + err = DB_INTERRUPTED; + goto idx_cond_failed; + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case CHECK_POS: + goto requires_clust_rec; + } + + ut_error; + } + } + } + +locks_ok: + /* NOTE that at this point rec can be an old version of a clustered + index record built for a consistent read. We cannot assume after this + point that rec is on a buffer pool page. Functions like + page_rec_is_comp() cannot be used! */ + + if (rec_get_deleted_flag(rec, comp)) { +locks_ok_del_marked: + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(index != clust_index + || row_get_rec_trx_id(rec, index, offsets)); + + /* The record is delete-marked: we can skip it */ + + /* This is an optimization to skip setting the next key lock + on the record that follows this delete-marked record. This + optimization works because of the unique search criteria + which precludes the presence of a range lock between this + delete marked record and the record following it. + + For now this is applicable only to clustered indexes while + doing a unique search except for HANDLER queries because + HANDLER allows NEXT and PREV even in unique search on + clustered index. There is scope for further optimization + applicable to unique secondary indexes. Current behaviour is + to widen the scope of a lock on an already delete marked record + if the same record is deleted twice by the same transaction */ + if (index == clust_index && unique_search + && !prebuilt->used_in_HANDLER) { + + err = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + goto next_rec; + } + + /* Check if the record matches the index condition. */ + switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) { + case CHECK_NEG: + if (did_semi_consistent_read) { + row_unlock_for_mysql(prebuilt, TRUE); + } + goto next_rec; + case CHECK_ABORTED_BY_USER: + err = DB_INTERRUPTED; + goto idx_cond_failed; + case CHECK_OUT_OF_RANGE: + case CHECK_ERROR: + err = DB_RECORD_NOT_FOUND; + goto idx_cond_failed; + case CHECK_POS: + break; + } + + if (index != clust_index && prebuilt->need_to_access_clustered) { + if (row_search_with_covering_prefix(prebuilt, rec, offsets)) { + goto use_covering_index; + } +requires_clust_rec: + ut_ad(index != clust_index); + /* We use a 'goto' to the preceding label if a consistent + read of a secondary index record requires us to look up old + versions of the associated clustered index record. */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_extra_clust_savepoint = mtr.get_savepoint(); + + ut_ad(!vrow); + /* The following call returns 'offsets' associated with + 'clust_rec'. Note that 'clust_rec' can be an old version + built for a consistent read. */ + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, + need_vrow ? 
&vrow : NULL, + &mtr); + if (err == DB_LOCK_WAIT && prebuilt->skip_locked) { + err = lock_trx_handle_wait(trx); + } + switch (err) { + case DB_SUCCESS: + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE + || dict_index_is_spatial(index)); + goto next_rec; + } + break; + case DB_SUCCESS_LOCKED_REC: + ut_a(clust_rec != NULL); + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + /* Note that the clustered index record + was locked. */ + prebuilt->new_rec_locks = 2; + } + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT_TIMEOUT: + case DB_LOCK_WAIT: + if (prebuilt->skip_locked) { + err = DB_SUCCESS; + goto next_rec; + } + /* fall through */ + default: + vrow = NULL; + goto lock_wait_or_error; + } + + if (rec_get_deleted_flag(clust_rec, comp)) { + + /* The record is delete marked: we can skip it */ + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + goto next_rec; + } + + if (need_vrow && !vrow) { + if (!heap) { + heap = mem_heap_create(100); + } + row_sel_fill_vrow(rec, index, &vrow, heap); + } + + result_rec = clust_rec; + ut_ad(rec_offs_validate(result_rec, clust_index, offsets)); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + /* Convert the record to MySQL format. We were + unable to do this in row_search_idx_cond_check(), + because the condition is on the secondary index + and the requested column is in the clustered index. + We convert all fields, including those that + may have been used in ICP, because the + secondary index may contain a column prefix + rather than the full column. Also, as noted + in Bug #56680, the column in the secondary + index may be in the wrong case, and the + authoritative case is in result_rec, the + appropriate version of the clustered index record. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, vrow, + true, clust_index, offsets)) { + goto next_rec; + } + } + } else { +use_covering_index: + result_rec = rec; + } + + /* We found a qualifying record 'result_rec'. At this point, + 'offsets' are associated with 'result_rec'. */ + + ut_ad(rec_offs_validate(result_rec, + result_rec != rec ? clust_index : index, + offsets)); + ut_ad(!rec_get_deleted_flag(result_rec, comp)); + + /* Decide whether to prefetch extra rows. + At this point, the clustered index record is protected + by a page latch that was acquired when pcur was positioned. + The latch will not be released until mtr.commit(). */ + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && !prebuilt->in_fts_query) { + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows in the case there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because there the cursor is a scrollable + cursor. */ + + ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + /* We only convert from InnoDB row format to MySQL row + format when ICP is disabled. 
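+	When the primary key filter or ICP is enabled, the row has already been converted into buf above, and it only needs to be enqueued in the prefetch cache here.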
*/ + + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { + /* We use next_buf to track the allocation of buffers + where we store and enqueue the buffers for our + pre-fetch optimisation. + + If next_buf == 0 then we store the converted record + directly into the MySQL record buffer (buf). If it is + != 0 then we allocate a pre-fetch buffer and store the + converted record there. + + If the conversion fails and the MySQL record buffer + was not written to then we reset next_buf so that + we can re-use the MySQL record buffer in the next + iteration. */ + + next_buf = next_buf + ? row_sel_fetch_last_buf(prebuilt) : buf; + + if (!row_sel_store_mysql_rec( + next_buf, prebuilt, result_rec, vrow, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + + if (next_buf == buf) { + ut_a(prebuilt->n_fetch_cached == 0); + next_buf = 0; + } + + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + goto next_rec; + } + + if (next_buf != buf) { + row_sel_enqueue_cache_row_for_mysql( + next_buf, prebuilt); + } + } else { + row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + } + + if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { + goto next_rec; + } + } else { + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { + /* The record was not yet converted to MySQL format. */ + if (!row_sel_store_mysql_rec( + buf, prebuilt, result_rec, vrow, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + /* Only fresh inserts may contain + incomplete externally stored + columns. Pretend that such records do + not exist. Such records may only be + accessed at the READ UNCOMMITTED + isolation level or when rolling back a + recovered transaction. Rollback + happens at a lower level, not here. */ + goto next_rec; + } + } + + if (!prebuilt->clust_index_was_generated) { + } else if (result_rec != rec || index->is_primary()) { + memcpy(prebuilt->row_id, result_rec, DATA_ROW_ID_LEN); + } else { + ulint len; + const byte* data = rec_get_nth_field( + result_rec, offsets, index->n_fields - 1, + &len); + ut_ad(dict_index_get_nth_col(index, + index->n_fields - 1) + ->prtype == (DATA_ROW_ID | DATA_NOT_NULL)); + ut_ad(len == DATA_ROW_ID_LEN); + memcpy(prebuilt->row_id, data, DATA_ROW_ID_LEN); + } + } + + /* From this point on, 'offsets' are invalid. */ + + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. Exceptions are locking reads and the MySQL + HANDLER command where the user can move the cursor with PREV or NEXT + even after a unique search. */ + + err = DB_SUCCESS; + +idx_cond_failed: + if (!unique_search + || !dict_index_is_clust(index) + || direction != 0 + || prebuilt->select_lock_type != LOCK_NONE + || prebuilt->used_in_HANDLER) { + + /* Inside an update always store the cursor position */ + + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + } + + goto normal_return; + +next_rec: + /* Reset the old and new "did semi-consistent read" flags. 
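+	The flags must describe only the row that is actually returned to MySQL, so any state left over from the previous candidate record is cleared here.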
*/ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } +next_rec_after_check: + did_semi_consistent_read = false; + prebuilt->new_rec_locks = 0; + vrow = NULL; + + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + /* NOTE: For moves_up==FALSE, the mini-transaction will be + committed and restarted every time when switching b-tree + pages. For moves_up==TRUE in index condition pushdown, we can + scan an entire secondary index tree within a single + mini-transaction. As long as the prebuilt->idx_cond does not + match, we do not need to consult the clustered index or + return records to MySQL, and thus we can avoid repositioning + the cursor. What prevents us from buffer-fixing all leaf pages + within the mini-transaction is the btr_leaf_page_release() + call in btr_pcur_move_to_next_page(). Only the leaf page where + the cursor is positioned will remain buffer-fixed. + For R-tree spatial search, we also commit the mini-transaction + each time */ + + if (spatial_search) { + /* No need to do store restore for R-tree */ + mtr.rollback_to_savepoint(0); + } else if (mtr_extra_clust_savepoint) { + /* We must release any clustered index latches + if we are moving to the next non-clustered + index record, because we could break the latching + order if we would access a different clustered + index page right away without releasing the previous. */ + mtr.rollback_to_savepoint(mtr_extra_clust_savepoint); + } + + mtr_extra_clust_savepoint = 0; + + if (moves_up) { + if (UNIV_UNLIKELY(spatial_search)) { + if (rtr_pcur_move_to_next( + search_tuple, mode, pcur, 0, &mtr)) { + goto rec_loop; + } + } else { + /* This is based on btr_pcur_move_to_next() */ + ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(pcur->latch_mode != BTR_NO_LATCHES); + pcur->old_rec = nullptr; + if (btr_pcur_is_after_last_on_page(pcur)) { + if (btr_pcur_is_after_last_in_tree(pcur)) { + goto not_moved; + } + err = btr_pcur_move_to_next_page(pcur, &mtr); + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } else if (!btr_pcur_move_to_next_on_page(pcur)) { + goto corrupted; + } + + goto rec_loop; + } + } else { + if (btr_pcur_move_to_prev(pcur, &mtr)) { + goto rec_loop; + } + if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) { +corrupted: + err = DB_CORRUPTION; + goto normal_return; + } + } + +not_moved: + if (!spatial_search) { + btr_pcur_store_position(pcur, &mtr); + } + + err = match_mode ? DB_RECORD_NOT_FOUND : DB_END_OF_INDEX; + goto normal_return; + +lock_wait_or_error: + if (!dict_index_is_spatial(index)) { + btr_pcur_store_position(pcur, &mtr); + } +page_read_error: + /* Reset the old and new "did semi-consistent read" flags. 
*/ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = false; + +lock_table_wait: + mtr.commit(); + mtr_extra_clust_savepoint = 0; + + trx->error_state = err; + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + mtr.start(); + + /* Table lock waited, go try to obtain table lock + again */ + if (table_lock_waited) { + table_lock_waited = FALSE; + + goto wait_table_again; + } + + if (!dict_index_is_spatial(index)) { + sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + } + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && !same_user_rec) { + + /* Since we were not able to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.cc we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + prebuilt->new_rec_locks = 0; + } + + mode = pcur->search_mode; + + goto rec_loop; + } + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + goto func_exit; + +normal_return: + mtr.commit(); + + DEBUG_SYNC_C("row_search_for_mysql_before_return"); + + if (prebuilt->pk_filter || prebuilt->idx_cond) { + /* When ICP is active we don't write to the MySQL buffer + directly, only to buffers that are enqueued in the pre-fetch + queue. We need to dequeue the first buffer and copy the contents + to the record buffer that was passed in by MySQL. */ + + if (prebuilt->n_fetch_cached > 0) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + err = DB_SUCCESS; + } + + } else if (next_buf != 0) { + + /* We may or may not have enqueued some buffers to the + pre-fetch queue, but we definitely wrote to the record + buffer passed to use by MySQL. */ + + DEBUG_SYNC_C("row_search_cached_row"); + err = DB_SUCCESS; + } + +#ifdef UNIV_DEBUG + if (dict_index_is_spatial(index) && err != DB_SUCCESS + && err != DB_END_OF_INDEX && err != DB_INTERRUPTED) { + rtr_node_path_t* path = pcur->btr_cur.rtr_info->path; + + ut_ad(path->empty()); + } +#endif + +func_exit: + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Set or reset the "did semi-consistent read" flag on return. + The flag did_semi_consistent_read is set if and only if + the record being returned was fetched with a semi-consistent read. */ + ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS + || !did_semi_consistent_read); + + if (prebuilt->row_read_type != ROW_READ_WITH_LOCKS) { + if (did_semi_consistent_read) { + prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + } + + DEBUG_SYNC_C("innodb_row_search_for_mysql_exit"); + + DBUG_RETURN(err); +} + +/********************************************************************//** +Count rows in a R-Tree leaf level. 
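+This is used by CHECK TABLE for SPATIAL indexes, which cannot be verified with the ordinary ascending-order scan of row_check_index().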
+@return DB_SUCCESS if successful */ +dberr_t +row_count_rtree_recs( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ +{ + dict_index_t* index = prebuilt->index; + dberr_t ret = DB_SUCCESS; + mtr_t mtr; + mem_heap_t* heap; + dtuple_t* entry; + dtuple_t* search_entry = prebuilt->search_tuple; + ulint entry_len; + ulint i; + byte* buf; + + ut_a(dict_index_is_spatial(index)); + + *n_rows = 0; + + heap = mem_heap_create(256); + + /* Build a search tuple. */ + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = ind_field->col; + dfield_t* dfield + = dtuple_get_nth_field(entry, i); + + if (i == 0) { + double* mbr; + double tmp_mbr[SPDIMS * 2]; + + dfield->type.mtype = DATA_GEOMETRY; + dfield->type.prtype |= DATA_GIS_MBR; + + /* Allocate memory for mbr field */ + mbr = static_cast<double*> + (mem_heap_alloc(heap, DATA_MBR_LEN)); + + /* Set mbr field data. */ + dfield_set_data(dfield, mbr, DATA_MBR_LEN); + + for (uint j = 0; j < SPDIMS; j++) { + tmp_mbr[j * 2] = DBL_MAX; + tmp_mbr[j * 2 + 1] = -DBL_MAX; + } + dfield_write_mbr(dfield, tmp_mbr); + continue; + } + + dfield->type.mtype = col->mtype; + dfield->type.prtype = col->prtype; + + } + + prebuilt->search_tuple = entry; + + ulint bufsize = std::max<ulint>(srv_page_size, + prebuilt->mysql_row_len); + buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); + + ulint direction = 0; + +loop: + ret = row_search_mvcc(buf, PAGE_CUR_WITHIN, prebuilt, 0, direction); + direction = ROW_SEL_NEXT; + + switch (ret) { + case DB_SUCCESS: + break; + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + case DB_LOCK_WAIT_TIMEOUT: + case DB_INTERRUPTED: + goto func_exit; + default: + /* fall through (this error is ignored by CHECK TABLE) */ + case DB_END_OF_INDEX: + ret = DB_SUCCESS; +func_exit: + prebuilt->search_tuple = search_entry; + ut_free(buf); + mem_heap_free(heap); + + return(ret); + } + + ++*n_rows; + goto loop; +} + +/** Check if a version of a clustered index record and a secondary +index record match. + +@param prebuilt index and transaction +@param clust_rec a version of a clustered index record +@param clust_index clustered index +@param clust_offsets rec_get_offsets(clust_rec, clust_index) +@param rec secondary index leaf page record +@param offsets rec_get_offsets(rec, index) +@return an error code +@retval DB_SUCCESS if rec matches clust_rec +@retval DB_SUCCESS_LOCKED_REC if rec does not match clust_rec +*/ +static dberr_t row_check_index_match(row_prebuilt_t *prebuilt, + const rec_t *clust_rec, + const dict_index_t *clust_index, + const rec_offs *clust_offsets, + const rec_t *rec, + const dict_index_t *index, + const rec_offs *offsets) +{ + ut_ad(index == prebuilt->index); + + ib_vcol_row vc(index->has_virtual() ? 
mem_heap_create(256) : nullptr); + + const uint16_t n= index->n_user_defined_cols; + + for (uint16_t i= 0; i < n; i++) + { + ulint pos= 0; + ulint len, sec_len; + + const dict_field_t &ifield= index->fields[i]; + const byte *sec_field= rec_get_nth_field(rec, offsets, i, &sec_len); + const byte *field; + + if (ifield.col->is_virtual()) + { + /* Virtual column values must be reconstructed from the base columns. */ + row_ext_t *ext; + byte *record= vc.record(prebuilt->trx->mysql_thd, clust_index, + &prebuilt->m_mysql_table); + const dict_v_col_t *v_col= reinterpret_cast<const dict_v_col_t*> + (ifield.col); + dtuple_t *row= row_build(ROW_COPY_POINTERS, + clust_index, clust_rec, clust_offsets, + nullptr, nullptr, nullptr, &ext, vc.heap); + if (dfield_t *vfield= + innobase_get_computed_value(row, v_col, clust_index, &vc.heap, + nullptr, nullptr, + prebuilt->trx->mysql_thd, + prebuilt->m_mysql_table, + record, nullptr, nullptr)) + { + len= vfield->len; + field= static_cast<byte*>(vfield->data); + } + else + { + innobase_report_computed_value_failed(row); + return DB_COMPUTE_VALUE_FAILED; + } + } + else + { + pos= dict_col_get_clust_pos(ifield.col, clust_index); + field= rec_get_nth_cfield(clust_rec, clust_index, clust_offsets, pos, + &len); + if (len == UNIV_SQL_NULL) + { + if (sec_len == UNIV_SQL_NULL) + continue; + return DB_SUCCESS_LOCKED_REC; + } + if (sec_len == UNIV_SQL_NULL) + return DB_SUCCESS_LOCKED_REC; + + if (rec_offs_nth_extern(clust_offsets, pos)) + { + if (len == BTR_EXTERN_FIELD_REF_SIZE) + goto compare_blobs; + len-= BTR_EXTERN_FIELD_REF_SIZE; + } + + if (ifield.prefix_len) + { + len= + dtype_get_at_most_n_mbchars(ifield.col->prtype, ifield.col->mbminlen, + ifield.col->mbmaxlen, + ifield.prefix_len, len, + reinterpret_cast<const char*>(field)); + if (len < sec_len) + goto check_for_blob; + } + else + { +check_for_blob: + if (rec_offs_nth_extern(clust_offsets, pos)) + { +compare_blobs: + if (!row_sel_sec_rec_is_for_blob(ifield.col->mtype, + ifield.col->prtype, + ifield.col->mbminlen, + ifield.col->mbmaxlen, + field, len, sec_field, sec_len, + ifield.prefix_len, + clust_index->table)) + return DB_SUCCESS_LOCKED_REC; + continue; + } + } + } + + if (cmp_data(ifield.col->mtype, ifield.col->prtype, false, + field, len, sec_field, sec_len)) + return DB_SUCCESS_LOCKED_REC; + } + + return DB_SUCCESS; +} + +/** +Check the index records in CHECK TABLE. +The index must contain entries in an ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted in according to the +current read view. 
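+In CHECK TABLE ... EXTENDED, each secondary index record is additionally matched against the corresponding clustered index record version.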
+ +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + *n_rows= 0; + dict_index_t *const index= prebuilt->index; + + if (!index->is_btree()) + return DB_CORRUPTION; + + mem_heap_t *heap= mem_heap_create(100); + + dtuple_t *prev_entry= nullptr; + mtr_t mtr; + mtr.start(); + + dict_index_t *clust_index= dict_table_get_first_index(prebuilt->table); + prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index; + dberr_t err= prebuilt->pcur->open_leaf(true, index, BTR_SEARCH_LEAF, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +func_exit: + mtr.commit(); + mem_heap_free(heap); + return err; + } + + if (const trx_id_t bulk_trx_id= index->table->bulk_trx_id) + if (!prebuilt->trx->read_view.changes_visible(bulk_trx_id)) + goto func_exit; + + ReadView check_table_extended_view; + ReadView &view= + prebuilt->need_to_access_clustered && + !prebuilt->table->is_temporary() && + prebuilt->trx->isolation_level != TRX_ISO_READ_UNCOMMITTED + ? check_table_extended_view : prebuilt->trx->read_view; + if (&view == &check_table_extended_view) + check_table_extended_view.set_creator_trx_id(prebuilt->trx->id); + +page_loop: + if (&view == &check_table_extended_view) + /* In CHECK TABLE...EXTENDED, we make a copy of purge_sys.end_view + while holding a shared latch on the index leaf page. + Should a currently active purge batch desire to remove any further + records from this page, it would be blocked by our page latch. + + We will consult check_table_extended_view to determine if a + clustered index record corresponding to a secondary index record + is visible to the current purge batch. Right after we have made our + copy, purge_sys.end_view is free to be changed again. + + If we have an orphan secondary index record, we may attempt to + request a clustered index record version that cannot be retrieved + any more because the undo log records may have been freed + (according to the purge_sys.end_view). In such a case, + trx_undo_get_undo_rec() would cause + trx_undo_prev_version_build() and trx_undo_prev_version_build() + to return DB_MISSING_HISTORY. 
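+  Such a DB_MISSING_HISTORY error is converted back to DB_SUCCESS below, and the secondary index record is treated as having no matching clustered index record version.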
*/ + static_cast<ReadViewBase&>(check_table_extended_view)= + purge_sys_t::end_view_guard{}.view(); + +rec_loop: + ut_ad(err == DB_SUCCESS); + + if (!btr_pcur_move_to_next_on_page(prebuilt->pcur)) + { + err= DB_CORRUPTION; + goto func_exit; + } + + const rec_t *rec= btr_pcur_get_rec(prebuilt->pcur); + rec_offs *offsets= offsets_; + + if (page_rec_is_supremum(rec)) + { + next_page: + if (btr_pcur_is_after_last_in_tree(prebuilt->pcur)) + goto func_exit; + err= btr_pcur_move_to_next_page(prebuilt->pcur, &mtr); + if (err == DB_SUCCESS && trx_is_interrupted(prebuilt->trx)) + err= DB_INTERRUPTED; + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + goto func_exit; + goto page_loop; + } + + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + const auto info_bits= + rec_get_info_bits(rec, prebuilt->table->not_redundant()); + const bool rec_deleted= info_bits & REC_INFO_DELETED_FLAG; + + if (UNIV_UNLIKELY(info_bits & REC_INFO_MIN_REC_FLAG)) + { + if (*n_rows || !index->is_instant()) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: invalid record encountered"); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + + if (prebuilt->table->is_temporary()) + { + count_or_not: + if (rec_deleted) + goto next_rec; + } + else if (index->is_clust()) + { + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) + goto count_or_not; + + trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + prebuilt->autoinc_error= DB_CORRUPTION; + goto next_rec; + } + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + { + ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); + rec_t *old_vers; + /* The following call returns 'offsets' associated with 'old_vers' */ + err= row_sel_build_prev_vers_for_mysql(prebuilt, index, rec, &offsets, + &heap, &old_vers, nullptr, &mtr); + + if (err != DB_SUCCESS) + goto func_exit; + + if (old_vers) + { + rec= old_vers; + rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_trx_id; + + if (!rec_get_deleted_flag(rec, prebuilt->table->not_redundant())) + goto count_row; + } + else + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + goto next_rec; + } + else if (!rec_deleted && !rec_trx_id); + else if (!check_table_extended_view.changes_visible(rec_trx_id)); + else if (prebuilt->autoinc_error == DB_SUCCESS) + { + const char *msg= rec_deleted + ? 
"Unpurged clustered index record" + : "Clustered index record with stale history"; + + ib::warn w; + w << msg << " in table " << index->table->name << ": " + << rec_offsets_print(rec, offsets); + prebuilt->autoinc_error= DB_MISSING_HISTORY; + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", w.m_oss.str().c_str()); + } + + goto count_or_not; + } + else if (const trx_id_t page_trx_id= page_get_max_trx_id(page_align(rec))) + { + if (page_trx_id >= trx_sys.get_max_trx_id()) + goto invalid_PAGE_MAX_TRX_ID; + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED); + else if (&view == &check_table_extended_view || rec_deleted || + !view.sees(page_trx_id)) + { + bool got_extended_match= &view == &check_table_extended_view; + const auto savepoint= mtr.get_savepoint(); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, index, offsets); + err= btr_pcur_open_with_no_init(prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, &mtr); + if (err != DB_SUCCESS) + goto func_exit; + + const rec_t *clust_rec= btr_pcur_get_rec(prebuilt->clust_pcur); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) || + btr_pcur_get_low_match(prebuilt->clust_pcur) < clust_index->n_uniq) + { + if (!rec_deleted) + { + not_found: + /* MDEV-29823 FIXME: There is a race condition between + rollback, purge, and possibly other SQL connections that + are creating and releasing read views. At the time + row_undo_mod_del_mark_or_remove_sec_low() is executing + rollback on a secondary index record, purge_sys.view + may not allow it to delete the record, and it will be + delete-marked. Eventually purge_sys.view would advance, + but the delete-marked record could never be removed, + because no undo log record was ever added to + the purge queue by trx_purge_add_undo_to_history(). + + For now, we will not flag an error about orphan secondary index + records that are delete-marked; we will only warn about them. */ + + if (!rec_deleted || prebuilt->autoinc_error == DB_SUCCESS) + { + ib::error_or_warn w(!rec_deleted); + w << "Clustered index record not found for index " + << index->name << " of table " << index->table->name + << ": " << rec_offsets_print(rec, offsets); + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", + w.m_oss.str().c_str()); + } + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= rec_deleted + ? DB_MISSING_HISTORY + : DB_CORRUPTION; + } + else if (&view == &check_table_extended_view) + extended_not_found: + if (view.changes_visible(page_trx_id)) + goto not_found; + did_not_find: + mtr.rollback_to_savepoint(savepoint); + goto next_rec; + } + + rec_offs *clust_offsets; + trx_id_t rec_trx_id; + rec_t *old_vers= nullptr; + + bool found_in_view= false; + trx_id_t visible_trx_id= ~0ULL; + + if (ulint trx_id_offset= clust_index->trx_id_offset) + { + clust_offsets= nullptr; + read_trx_id: + rec_trx_id= trx_read_trx_id(clust_rec + trx_id_offset); + + if (clust_rec[trx_id_offset + DATA_TRX_ID_LEN] & 0x80) + { + if (UNIV_UNLIKELY + (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()))) + { + err= DB_CORRUPTION; + goto func_exit; + } + + /* This is the oldest available record version (fresh insert). 
*/ + if (!view.changes_visible(rec_trx_id)) + { + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_rec_trx_id; + if (got_extended_match) + goto check_latest_version; + goto did_not_find; + } + } + } + else + { + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + ulint trx_id_pos= clust_index->n_uniq ? clust_index->n_uniq : 1; + ulint len; + trx_id_offset= rec_get_nth_field_offs(clust_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + goto read_trx_id; + } + + if (got_extended_match) + { + check_latest_version: + /* In CHECK TABLE...EXTENDED, always check if the secondary + index record matches the latest clustered index record + version, no matter if it is visible in our own read view. + + If the latest clustered index version is delete-marked and + purgeable, it is not safe to fetch any BLOBs for column prefix + indexes because they may already have been freed. */ + if (rec_trx_id && + rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()) && + purge_sys.is_purgeable(rec_trx_id)) + goto did_not_find; + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + err= row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto func_exit; + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + } + + got_extended_match= err == DB_SUCCESS; + err= DB_SUCCESS; + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + /* While CHECK TABLE ... EXTENDED checks for a matching + clustered index record version for each secondary index + record, it must count only those records that belong to its + own read view. + + If the latest version of clust_rec matches rec but is not + in our read view, there may still be an older version of + clust_rec that not only matches rec but is in our view. + We must evaluate old versions before deciding whether rec + should be counted. */ + goto check_old_vers; + + /* Remember that this is the visible clust_rec for rec, + and whether it matches rec. */ + visible_trx_id= rec_trx_id; + found_in_view= got_extended_match && + !rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()); + + if (!got_extended_match) + goto check_old_vers; + + if (!found_in_view) + goto did_not_find; + + found_match: + mtr.rollback_to_savepoint(savepoint); + goto count_row; + } + else if (!view.changes_visible(rec_trx_id)) + { + check_old_vers: + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_rec_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + goto not_found; + } + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + row_sel_reset_old_vers_heap(prebuilt); + /* The following is adapted from row_vers_build_for_consistent_read() + because when using check_table_extended_view, we must + consider every available version of the clustered index record. 
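+  Each iteration of the loop below builds the previous version of clust_rec into a fresh heap and frees the heap that held the version before it.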
*/ + mem_heap_t *vers_heap= nullptr; + + for (;;) + { + mem_heap_t *prev_heap= vers_heap; + vers_heap= mem_heap_create(1024); + err= trx_undo_prev_version_build(clust_rec, + clust_index, clust_offsets, + vers_heap, &old_vers, + nullptr, nullptr, 0); + if (prev_heap) + mem_heap_free(prev_heap); + if (err != DB_SUCCESS) + { + old_vers_err: + mem_heap_free(vers_heap); + if (err == DB_MISSING_HISTORY) + { + err= DB_SUCCESS; + if (got_extended_match) + goto did_not_find; + goto not_found; + } + goto func_exit; + } + + if (UNIV_UNLIKELY(!old_vers)) + { + mem_heap_free(vers_heap); + /* We did not find a matching clustered index record version + for the secondary index record. Normal CHECK TABLE will simply + not count the secondary index record; CHECK TABLE ... EXTENDED + will flag such orphan records if appropriate. + + A secondary index record may may be "temporarily orphan" + if purge is in progress. We will only flag them if + everything up to PAGE_MAX_TRX_ID has been fully purged. + + "Temporary orphans" may be produced when + row_undo_mod_clust() resets the DB_TRX_ID of the latest + clust_rec version or when trx_undo_prev_version_build() + encounters a BLOB that may have been freed according to + purge_sys.view (not purge_sys.end_view). */ + if (&view == &check_table_extended_view && !got_extended_match) + goto extended_not_found; + goto did_not_find; + } + + clust_rec= old_vers; + clust_offsets= rec_get_offsets(clust_rec, clust_index, clust_offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + rec_trx_id= row_get_rec_trx_id(clust_rec, clust_index, + clust_offsets); + + if (UNIV_UNLIKELY(rec_trx_id >= + prebuilt->trx->read_view.low_limit_id() && + rec_trx_id >= trx_sys.get_max_trx_id())) + { + mem_heap_free(vers_heap); + goto invalid_rec_trx_id; + } + + const bool rec_visible= + prebuilt->trx->read_view.changes_visible(rec_trx_id); + const bool clust_rec_deleted= + rec_get_deleted_flag(clust_rec, prebuilt->table->not_redundant()); + + if (&view != &prebuilt->trx->read_view) + { + /* It is not safe to fetch BLOBs of committed delete-marked + records that may have been freed in purge. */ + err= clust_rec_deleted && rec_trx_id && + purge_sys.is_purgeable(rec_trx_id) + ? 
DB_SUCCESS_LOCKED_REC + : row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto old_vers_err; + case DB_SUCCESS_LOCKED_REC: + if (rec_visible && !~visible_trx_id) + visible_trx_id= rec_trx_id; + continue; + case DB_SUCCESS: + got_extended_match= true; + if (!rec_visible) + continue; + if (!~visible_trx_id) + { + visible_trx_id= rec_trx_id; + found_in_view= !clust_rec_deleted; + } + mem_heap_free(vers_heap); + if (!found_in_view) + goto did_not_find; + goto found_match; + } + } + else if (rec_visible) + { + if (!clust_rec_deleted) + { + clust_rec= rec_copy(mem_heap_alloc(heap, + rec_offs_size(clust_offsets)), + clust_rec, clust_offsets); + rec_offs_make_valid(clust_rec, clust_index, true, clust_offsets); + } + mem_heap_free(vers_heap); + if (clust_rec_deleted) + goto did_not_find; + goto check_match; + } + } + } + else if (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant())) + goto did_not_find; + + ut_ad(clust_rec); + ut_ad(&view != &check_table_extended_view); + + /* If we had to go to an earlier version of row or the secondary + index record is delete marked, then it may be that the secondary + index record corresponding to clust_rec (or old_vers) is not + rec; in that case we must ignore such row because in our + snapshot rec would not have existed. Remember that from rec we + cannot see directly which transaction id corresponds to it: we + have to go to the clustered index record. A query where we want + to fetch all rows where the secondary index value is in some + interval would return a wrong result if we would not drop rows + which we come to visit through secondary index records that + would not really exist in our snapshot. */ + + if (rec_deleted) + { + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + check_match: + /* This clustered index record version exists in + prebuilt->trx->read_view and is not delete-marked. + By design, any BLOBs in it are not allowed to be + freed in the purge of committed transaction history. 
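+  Therefore it is safe for row_check_index_match() to dereference externally stored column prefixes while comparing the records.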
*/ + err= row_check_index_match(prebuilt, clust_rec, clust_index, + clust_offsets, rec, index, offsets); + switch (err) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + err= DB_SUCCESS; + goto did_not_find; + default: + goto func_exit; + } + } + + mtr.rollback_to_savepoint(savepoint); + } + } + else + { + invalid_PAGE_MAX_TRX_ID: + if (UNIV_LIKELY(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN)) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: Invalid PAGE_MAX_TRX_ID=%llu" + " in index '%-.200s'", + page_trx_id, index->name()); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + +count_row: + ++*n_rows; + + if (prev_entry) + { + ulint matched_fields= 0; + int cmp= cmp_dtuple_rec_with_match(prev_entry, rec, index, offsets, + &matched_fields); + const char* msg; + + if (UNIV_LIKELY(cmp < 0)); + else if (cmp > 0) + { + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + msg= "index records in a wrong order in "; +not_ok: + ib::error() << msg << index->name << " of table " << index->table->name + << ": " << *prev_entry << ", " + << rec_offsets_print(rec, offsets); + } + else if (index->is_unique() && matched_fields >= + dict_index_get_n_ordering_defined_by_user(index)) + { + /* NULL values in unique indexes are considered not to be duplicates */ + for (ulint i= 0; i < dict_index_get_n_ordering_defined_by_user(index); + i++) + if (dfield_is_null(dtuple_get_nth_field(prev_entry, i))) + goto next_rec; + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= DB_DUPLICATE_KEY; + msg= "duplicate key in "; + goto not_ok; + } + } + +next_rec: + ut_ad(err == DB_SUCCESS); + + { + mem_heap_t *tmp_heap= nullptr; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) + { + ulint size= rec_offs_get_n_alloc(offsets) * sizeof *offsets; + tmp_heap= mem_heap_create(size); + offsets= static_cast<rec_offs*>(mem_heap_dup(tmp_heap, offsets, size)); + } + + mem_heap_empty(heap); + prev_entry= row_rec_to_index_entry(rec, index, offsets, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) + mem_heap_free(tmp_heap); + } + + if (btr_pcur_is_after_last_on_page(prebuilt->pcur)) + goto next_page; + + goto rec_loop; +} + +/*******************************************************************//** +Read the AUTOINC column from the current row. If the value is less than +0 and the type is not unsigned then we reset the value to 0. +@return value read from the column */ +static +ib_uint64_t +row_search_autoinc_read_column( +/*===========================*/ + dict_index_t* index, /*!< in: index to read from */ + const rec_t* rec, /*!< in: current rec */ + ulint col_no, /*!< in: column number */ + ulint mtype, /*!< in: column main type */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + ulint len; + const byte* data; + ib_uint64_t value; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + rec_offs_init(offsets_); + ut_ad(page_rec_is_leaf(rec)); + + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + col_no + 1, &heap); + + if (rec_offs_nth_sql_null(offsets, col_no)) { + /* There is no non-NULL value in the auto-increment column. 
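+	Return 0, just as if no records had been found.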
*/ + value = 0; + goto func_exit; + } + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + value = row_parse_int(data, len, mtype, unsigned_type); + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(value); +} + +/** Get the maximum and non-delete-marked record in an index. +@param[in] index index tree +@param[in,out] mtr mini-transaction (may be committed and restarted) +@return maximum record, page s-latched in mtr +@retval NULL if there are no records, or if all of them are delete-marked */ +static +const rec_t* +row_search_get_max_rec( + dict_index_t* index, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const rec_t* rec; + const bool desc = index->fields[0].descending; + + if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) { + return nullptr; + } + + if (desc) { + const bool comp = index->table->not_redundant(); + while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) { + rec = btr_pcur_get_rec(&pcur); + if (rec_is_metadata(rec, *index)) { + continue; + } + if (!rec_get_deleted_flag(rec, comp)) { + goto found; + } + } + } else { + do { + rec = page_find_rec_last_not_deleted( + btr_pcur_get_page(&pcur)); + if (page_rec_is_user_rec(rec)) { + goto found; + } + btr_pcur_move_before_first_on_page(&pcur); + } while (btr_pcur_move_to_prev(&pcur, mtr)); + } + + rec = nullptr; + +found: + ut_ad(!rec + || !(rec_get_info_bits(rec, dict_table_is_comp(index->table)) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); + return(rec); +} + +/** Read the max AUTOINC value from an index. +@param[in] index index starting with an AUTO_INCREMENT column +@return the largest AUTO_INCREMENT value +@retval 0 if no records were found */ +ib_uint64_t +row_search_max_autoinc(dict_index_t* index) +{ + const dict_field_t* dfield = dict_index_get_nth_field(index, 0); + + ib_uint64_t value = 0; + + mtr_t mtr; + mtr.start(); + + if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) { + value = row_search_autoinc_read_column( + index, rec, 0, + dfield->col->mtype, + dfield->col->prtype & DATA_UNSIGNED); + } + + mtr.commit(); + return(value); +} diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc new file mode 100644 index 00000000..23255cc9 --- /dev/null +++ b/storage/innobase/row/row0uins.cc @@ -0,0 +1,652 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0uins.cc +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "fil0fil.h" +#include <mysql/service_thd_mdl.h> + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***************************************************************//** +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_clust_rec( +/*==========================*/ + undo_node_t* node) /*!< in: undo node */ +{ + dberr_t err; + ulint n_tries = 0; + mtr_t mtr; + dict_index_t* index = node->pcur.index(); + table_id_t table_id = 0; + const bool dict_locked = node->trx->dict_operation_lock_mode; +restart: + MDL_ticket* mdl_ticket = nullptr; + ut_ad(!table_id || dict_locked + || !node->trx->dict_operation_lock_mode); + dict_table_t *table = table_id + ? dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED, + node->trx->mysql_thd, &mdl_ticket) + : nullptr; + + ut_ad(index->is_primary()); + ut_ad(node->trx->in_rollback); + + mtr.start(); + if (index->table->is_temporary()) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + mtr.set_log_mode(MTR_LOG_NO_REDO); + ut_ad(index->table->id >= DICT_HDR_FIRST_ID); + } else { + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); + } + + /* This is similar to row_undo_mod_clust(). The DDL thread may + already have copied this row from the log to the new table. + We must log the removal, so that the row will be correctly + purged. However, we can log the removal out of sync with the + B-tree modification. */ + ut_a(node->pcur.restore_position( + (node->rec_type == TRX_UNDO_INSERT_METADATA) + ? 
BTR_MODIFY_TREE + : BTR_MODIFY_LEAF, + &mtr) == btr_pcur_t::SAME_ALL); + rec_t* rec = btr_pcur_get_rec(&node->pcur); + + ut_ad(rec_get_trx_id(rec, index) == node->trx->id + || node->table->is_temporary()); + ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant()) + || rec_is_alter_metadata(rec, index->table->not_redundant())); + ut_ad(rec_is_metadata(rec, index->table->not_redundant()) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); + + switch (node->table->id) { + case DICT_COLUMNS_ID: + /* This is rolling back an INSERT into SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. */ + ut_ad(node->trx->dict_operation_lock_mode); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (rec_get_n_fields_old(rec) + != DICT_NUM_FIELDS__SYS_COLUMNS + || (rec_get_1byte_offs_flag(rec) + ? rec_1_get_field_end_info(rec, 0) != 8 + : rec_2_get_field_end_info(rec, 0) != 8)) { + break; + } + static_assert(!DICT_FLD__SYS_COLUMNS__TABLE_ID, ""); + node->trx->evict_table(mach_read_from_8(rec)); + break; + case DICT_INDEXES_ID: + ut_ad(node->trx->dict_operation_lock_mode); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (!table_id) { + table_id = mach_read_from_8(rec); + if (table_id) { + mtr.commit(); + goto restart; + } + ut_ad("corrupted SYS_INDEXES record" == 0); + } + + pfs_os_file_t d = OS_FILE_CLOSED; + + const uint32_t space_id = dict_drop_index_tree( + &node->pcur, node->trx, &mtr); + if (space_id) { + if (table) { + lock_release_on_rollback(node->trx, + table); + if (!dict_locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + if (table->release()) { + dict_sys.remove(table); + } else if (table->space_id + == space_id) { + table->space = nullptr; + table->file_unreadable = true; + } + if (!dict_locked) { + dict_sys.unlock(); + } + table = nullptr; + if (!mdl_ticket); + else if (MDL_context* mdl_context = + static_cast<MDL_context*>( + thd_mdl_context( + node->trx-> + mysql_thd))) { + mdl_context->release_lock( + mdl_ticket); + mdl_ticket = nullptr; + } + } + + d = fil_delete_tablespace(space_id); + } + + mtr.commit(); + + if (d != OS_FILE_CLOSED) { + os_file_close(d); + } + + if (space_id) { + ibuf_delete_for_discarded_space(space_id); + } + + mtr.start(); + ut_a(node->pcur.restore_position( + BTR_MODIFY_LEAF, &mtr) == btr_pcur_t::SAME_ALL); + } + + err = btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr); + + if (err != DB_FAIL) { + goto func_exit; + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + ut_a(node->pcur.restore_position(BTR_PURGE_TREE, &mtr) + == btr_pcur_t::SAME_ALL); + + btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true, + &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + +func_exit: + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { + /* When rolling back the very first 
instant ADD COLUMN + operation, reset the root page to the basic state. */ + btr_reset_instant(*index, true, &mtr); + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + if (UNIV_LIKELY_NULL(table)) { + dict_table_close(table, dict_locked, + node->trx->mysql_thd, mdl_ticket); + } + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_low( +/*========================*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to remove */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + const bool modify_leaf = mode == BTR_MODIFY_LEAF; + + pcur.btr_cur.page_cur.index = index; + row_mtr_start(&mtr, index, !modify_leaf); + + if (index->is_spatial()) { + mode = modify_leaf + ? btr_latch_mode(BTR_MODIFY_LEAF + | BTR_RTREE_DELETE_MARK + | BTR_RTREE_UNDO_INS) + : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS); + btr_pcur_get_btr_cur(&pcur)->thr = thr; + if (rtr_search(entry, mode, &pcur, &mtr)) { + goto func_exit; + } + + if (rec_get_deleted_flag( + btr_pcur_get_rec(&pcur), + dict_table_is_comp(index->table))) { + ib::error() << "Record found in index " << index->name + << " is deleted marked on insert rollback."; + ut_ad(0); + } + goto found; + } else if (modify_leaf) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + ut_ad(mode == BTR_PURGE_TREE); + mode = BTR_PURGE_TREE_ALREADY_LATCHED; + mtr_x_lock_index(index, &mtr); + } + + switch (row_search_index_entry(entry, mode, &pcur, &mtr)) { + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: + break; + case ROW_FOUND: + found: + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (modify_leaf) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr); + } else { + /* Passing rollback=false here, because we are + deleting a secondary index record: the distinction + only matters when deleting a record that contains + externally stored columns. */ + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + false, &mtr); + } + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. 
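+If the pessimistic descent also fails, for example because of a lack of file space, it is retried up to BTR_CUR_RETRY_DELETE_N_TIMES times with a short sleep between the attempts.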
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec( +/*====================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_PURGE_TREE, index, entry, thr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/** Parse an insert undo record. +@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked) +{ + dict_index_t* clust_index; + const byte* ptr; + undo_no_t undo_no; + table_id_t table_id; + byte dummy; + bool dummy_extern; + + ut_ad(node->trx->in_rollback); + ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy, + &dummy_extern, &undo_no, &table_id); + + node->update = NULL; + if (!node->is_temp) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + node->table = dict_sys.acquire_temporary_table(table_id); + dict_sys.unfreeze(); + } else { + node->table = dict_sys.acquire_temporary_table(table_id); + } + + if (!node->table) { + return false; + } + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + goto close_table; + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + case TRX_UNDO_EMPTY: + break; + case TRX_UNDO_RENAME_TABLE: + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(table->file_unreadable + || dict_table_is_file_per_table(table) + == !is_system_tablespace(table->space_id)); + size_t len = mach_read_from_2(node->undo_rec) + - page_offset(ptr) - 2; + const span<const char> name(reinterpret_cast<const char*>(ptr), + len); + if (strlen(table->name.m_name) != len + || memcmp(table->name.m_name, ptr, len)) { + dict_table_rename_in_cache(table, name, true); + } else if (table->space && table->space->id) { + const auto s = table->space->name(); + if (len != s.size() || memcmp(ptr, s.data(), len)) { + table->rename_tablespace(name, true); + } + } + goto close_table; + } + + if (UNIV_UNLIKELY(!node->table->is_accessible())) { +close_table: + /* Normally, tables should not disappear or become + unaccessible during ROLLBACK, because they should be + protected by InnoDB table locks. Corruption could be + a valid exception. + + FIXME: When running out of temporary tablespace, it + would probably be better to just drop all temporary + tables (and temporary undo log records) of the current + connection, instead of doing this rollback. 
*/ + dict_table_close(node->table, dict_locked); + node->table = NULL; + return false; + } else { + ut_ad(!node->table->skip_alter_undo); + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + switch (node->rec_type) { + case TRX_UNDO_INSERT_REC: + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, + node->heap); + break; + case TRX_UNDO_EMPTY: + node->ref = nullptr; + return true; + default: + node->ref = &trx_undo_metadata; + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + return true; + } + + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + if (node->table->n_v_cols) { + trx_undo_read_v_cols(node->table, ptr, + node->row, false); + } + + } else { + ib::warn() << "Table " << node->table->name + << " has no indexes," + " ignoring the table"; + goto close_table; + } + } + + return true; +} + +/***************************************************************//** +Removes secondary index records. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_rec( +/*========================*/ + undo_node_t* node, /*!< in/out: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + dict_index_t* index; + mem_heap_t* heap; + + heap = mem_heap_create(1024); + + for (index = node->index; index; + index = dict_table_get_next_index(index)) { + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record, or a statement is being rolled + back because an error occurred while storing + off-page columns. + + Because secondary index entries are inserted + after the clustered index record, we may + assume that the secondary index record does + not exist. */ + } else { + err = row_undo_ins_remove_sec(index, entry, thr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + mem_heap_empty(heap); + } + +func_exit: + node->index = index; + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. 
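[Editor's sketch] row_undo_ins() below rolls back a fresh insert by removing the entry from every secondary index first and the clustered index record last, i.e. the reverse of the insert order. A minimal sketch of that ordering, using stand-in callables rather than the InnoDB API:

// Illustrative only; the callables are hypothetical stand-ins for
// row_undo_ins_remove_sec_rec() and row_undo_ins_remove_clust_rec().
#include <functional>
#include <vector>

enum class UndoStatus { SUCCESS, OUT_OF_FILE_SPACE };

UndoStatus undo_fresh_insert(
    const std::vector<std::function<UndoStatus()>>& remove_secondary_entries,
    const std::function<UndoStatus()>& remove_clustered_record)
{
    for (const auto& remove_entry : remove_secondary_entries) {
        // A missing secondary entry is tolerated by the real code: it may
        // never have been written if the server crashed mid-statement.
        if (UndoStatus s = remove_entry(); s != UndoStatus::SUCCESS) {
            return s;
        }
    }
    // Only after the secondary indexes are clean is the clustered record
    // removed, mirroring the order in row_undo_ins().
    return remove_clustered_record();
}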
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + const bool dict_locked = node->trx->dict_operation_lock_mode; + + if (!row_undo_ins_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; + } + + ut_ad(node->table->is_temporary() + || lock_table_has_locks(node->table)); + + /* Iterate over all the indexes and undo the insert.*/ + + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + /* fall through */ + case TRX_UNDO_INSERT_REC: + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + err = row_undo_ins_remove_sec_rec(node, thr); + + if (err != DB_SUCCESS) { + break; + } + + log_free_check(); + + if (!dict_locked && node->table->id == DICT_INDEXES_ID) { + dict_sys.lock(SRW_LOCK_CALL); + err = row_undo_ins_remove_clust_rec(node); + dict_sys.unlock(); + } else { + ut_ad(node->table->id != DICT_INDEXES_ID + || !node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); + } + + if (err == DB_SUCCESS && node->table->stat_initialized) { + /* Not protected by dict_sys.latch + or table->stats_mutex_lock() for + performance reasons, we would rather get garbage + in stat_n_rows (which is just an estimate anyway) + than protecting the following code with a latch. */ + dict_table_n_rows_dec(node->table); + + /* Do not attempt to update statistics when + executing ROLLBACK in the InnoDB SQL + interpreter, because in that case we would + already be holding dict_sys.latch, which + would be acquired when updating statistics. */ + if (!dict_locked) { + dict_stats_update_if_needed(node->table, + *node->trx); + } + } + break; + + case TRX_UNDO_INSERT_METADATA: + log_free_check(); + ut_ad(!node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); + break; + case TRX_UNDO_EMPTY: + err = node->table->clear(thr); + break; + } + + dict_table_close(node->table, dict_locked); + + node->table = NULL; + + return(err); +} diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc new file mode 100644 index 00000000..a01eaea5 --- /dev/null +++ b/storage/innobase/row/row0umod.cc @@ -0,0 +1,1288 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0umod.cc +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "ibuf0ibuf.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Undoes a modify in a clustered index record. 
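[Editor's sketch] The IMPORTANT NOTE above describes a calling convention rather than an algorithm: the redo-log free-space check must happen while the caller holds no latches, i.e. before the mini-transaction that will generate the redo. A hedged sketch of that ordering, with made-up names standing in for log_free_check() and the mtr machinery:

// Sketch of the convention only; ensure_redo_log_space(), MiniTransaction and
// apply_changes are hypothetical stand-ins, not InnoDB functions.
struct MiniTransaction {
    void start()  {}   // would begin latching pages and buffering redo
    void commit() {}   // would write the buffered redo and release latches
};

void ensure_redo_log_space() {
    // Corresponds to log_free_check(): may wait for a checkpoint, so it must
    // be called while no synchronization objects are held.
}

void redo_generating_operation(void (*apply_changes)(MiniTransaction&)) {
    ensure_redo_log_space();   // 1. check space first, with no latches held
    MiniTransaction mtr;
    mtr.start();               // 2. only now acquire latches
    apply_changes(mtr);        // 3. generate the redo that was budgeted for
    mtr.commit();
}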
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust_low( +/*===================*/ + undo_node_t* node, /*!< in: row undo node */ + rec_offs** offsets,/*!< out: rec_get_offsets() on the record */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR + for row_log_table_delete() */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in: mtr; must be committed before + latching any further pages */ + btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + + pcur = &node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (pcur->restore_position(mode, mtr) != btr_pcur_t::SAME_ALL) { + return DB_CORRUPTION; + } + + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur)) + == thr_get_trx(thr)->id + || btr_cur_get_index(btr_cur)->table->is_temporary()); + ut_ad(node->ref != &trx_undo_metadata + || node->update->info_bits == REC_INFO_METADATA_ADD + || node->update->info_bits == REC_INFO_METADATA_ALTER); + + if (mode != BTR_MODIFY_TREE) { + ut_ad(mode == BTR_MODIFY_LEAF + || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED); + + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + ut_ad(err != DB_SUCCESS || node->ref != &trx_undo_metadata); + } else { + big_rec_t* dummy_big_rec; + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, heap, + &dummy_big_rec, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + + ut_a(!dummy_big_rec); + + if (err == DB_SUCCESS + && node->ref == &trx_undo_metadata + && btr_cur_get_index(btr_cur)->table->instant + && node->update->info_bits == REC_INFO_METADATA_ADD) { + btr_reset_instant(*btr_cur->index(), false, mtr); + } + } + + if (err != DB_SUCCESS) { + return err; + } + + switch (const auto id = btr_cur_get_index(btr_cur)->table->id) { + unsigned c; + case DICT_TABLES_ID: + if (node->trx != trx_roll_crash_recv_trx) { + break; + } + c = DICT_COL__SYS_TABLES__ID; + goto evict; + case DICT_INDEXES_ID: + if (node->trx != trx_roll_crash_recv_trx) { + break; + } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC + && btr_cur_get_rec(btr_cur) + [8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) { + /* We are rolling back the DELETE of metadata + for a failed ADD INDEX operation. This does + not affect any cached table definition, + because we are filtering out such indexes in + dict_load_indexes(). */ + break; + } + /* fall through */ + case DICT_COLUMNS_ID: + static_assert(!DICT_COL__SYS_INDEXES__TABLE_ID, ""); + static_assert(!DICT_COL__SYS_COLUMNS__TABLE_ID, ""); + c = DICT_COL__SYS_COLUMNS__TABLE_ID; + /* This is rolling back an UPDATE or DELETE on SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. 
*/ + evict: + const dfield_t& table_id = *dtuple_get_nth_field(node->row, c); + ut_ad(dfield_get_len(&table_id) == 8); + node->trx->evict_table(mach_read_from_8( + static_cast<byte*>( + table_id.data)), + id == DICT_COLUMNS_ID); + } + + return DB_SUCCESS; +} + +/** Get the byte offset of the DB_TRX_ID column +@param[in] rec clustered index record +@param[in] index clustered index +@return the byte offset of DB_TRX_ID, from the start of rec */ +static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index) +{ + ut_ad(index->n_uniq <= MAX_REF_PARTS); + ulint trx_id_offset = index->trx_id_offset; + if (!trx_id_offset) { + /* Reserve enough offsets for the PRIMARY KEY and 2 columns + so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + const ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + rec_offs* offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + trx_id_pos + 1, &heap); + ut_ad(!heap); + ulint len; + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + return trx_id_offset; +} + +/** Determine if rollback must execute a purge-like operation. +@param node row undo +@return whether the record should be purged */ +static bool row_undo_mod_must_purge(const undo_node_t &node) +{ + ut_ad(node.rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node.table->is_temporary()); + + const btr_cur_t &btr_cur= node.pcur.btr_cur; + ut_ad(btr_cur.index()->is_primary()); + DEBUG_SYNC_C("rollback_purge_clust"); + + if (!purge_sys.is_purgeable(node.new_trx_id)) + return false; + + const rec_t *rec= btr_cur_get_rec(&btr_cur); + return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur.index())) == + node.new_trx_id; +} + +/***********************************************************//** +Undoes a modify in a clustered index record. Sets also the node state for the +next round of undo. 
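[Editor's sketch] row_trx_id_offset() above locates DB_TRX_ID behind the PRIMARY KEY fields of a clustered index record; in the simple case of a fixed-length key the offset is just the sum of the key field lengths, and DB_ROLL_PTR follows immediately. A simplified sketch of that layout (the 6- and 7-byte lengths match DATA_TRX_ID_LEN and DATA_ROLL_PTR_LEN; everything else is a stand-in and the variable-length-key case handled by rec_get_offsets() is omitted):

// Illustrative only, not InnoDB code.
#include <cstddef>
#include <numeric>
#include <vector>

struct ClusteredRecordLayout {
    std::vector<std::size_t> pk_field_lengths; // fixed-length PRIMARY KEY fields
    static constexpr std::size_t TRX_ID_LEN   = 6; // DATA_TRX_ID_LEN
    static constexpr std::size_t ROLL_PTR_LEN = 7; // DATA_ROLL_PTR_LEN

    // Byte offset of DB_TRX_ID from the start of the record data.
    std::size_t trx_id_offset() const {
        return std::accumulate(pk_field_lengths.begin(),
                               pk_field_lengths.end(), std::size_t{0});
    }
    // DB_ROLL_PTR immediately follows DB_TRX_ID.
    std::size_t roll_ptr_offset() const {
        return trx_id_offset() + TRX_ID_LEN;
    }
};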
+@return DB_SUCCESS or error code: we may run out of file space */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_clust( +/*===============*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + dberr_t err; + dict_index_t* index; + + ut_ad(thr_get_trx(thr) == node->trx); + ut_ad(node->trx->in_rollback); + + log_free_check(); + pcur = &node->pcur; + index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + ut_ad(index->is_primary()); + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); + } + + mem_heap_t* heap = mem_heap_create(1024); + mem_heap_t* offsets_heap = NULL; + rec_offs* offsets = NULL; + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, sys, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, sys, thr, &mtr, + BTR_MODIFY_TREE); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } + + /** + * when scrubbing, and records gets cleared, + * the transaction id is not present afterwards. + * this is safe as: since the record is on free-list + * it can be reallocated at any time after this mtr-commits + * which is just below + */ + ut_ad(srv_immediate_scrub_data_uncompressed + || row_get_rec_trx_id(btr_pcur_get_rec(pcur), index, offsets) + == node->new_trx_id); + + btr_pcur_commit_specify_mtr(pcur, &mtr); + DEBUG_SYNC_C("rollback_undo_pk"); + + if (err != DB_SUCCESS) { + goto func_exit; + } + + /* FIXME: Perform the below operations in the above + mini-transaction when possible. */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. 
*/ + ut_ad(node->new_trx_id); + + mtr.start(); + if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) != + btr_pcur_t::SAME_ALL) { + goto mtr_commit_exit; + } + + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + err = btr_cur_optimistic_delete(&pcur->btr_cur, 0, + &mtr); + if (err != DB_FAIL) { + goto mtr_commit_exit; + } + err = DB_SUCCESS; + btr_pcur_commit_specify_mtr(pcur, &mtr); + } else { + index->set_modified(mtr); + if (!row_undo_mod_must_purge(*node)) { + goto mtr_commit_exit; + } + err = btr_cur_optimistic_delete(&pcur->btr_cur, 0, + &mtr); + if (err != DB_FAIL) { + goto mtr_commit_exit; + } + err = DB_SUCCESS; + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + mtr.start(); + if (pcur->restore_position(BTR_PURGE_TREE, &mtr) != + btr_pcur_t::SAME_ALL) { + goto mtr_commit_exit; + } + + ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(node->table))); + + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + if (!row_undo_mod_must_purge(*node)) { + goto mtr_commit_exit; + } + index->set_modified(mtr); + } + + /* This operation is analogous to purge, we can free + also inherited externally stored fields. We can also + assume that the record was complete (including BLOBs), + because it had been delete-marked after it had been + completely inserted. Therefore, we are passing + rollback=false, just like purge does. */ + btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0, + false, &mtr); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } else if (!index->table->is_temporary() && node->new_trx_id) { + /* We rolled back a record so that it still exists. + We must reset the DB_TRX_ID if the history is no + longer accessible by any active read view. */ + + mtr.start(); + if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) + != btr_pcur_t::SAME_ALL + || !purge_sys.is_purgeable(node->new_trx_id)) { + goto mtr_commit_exit; + } + + rec_t* rec = btr_pcur_get_rec(pcur); + ulint trx_id_offset = index->trx_id_offset; + ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; + /* Reserve enough offsets for the PRIMARY KEY and + 2 columns so that we can access DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs offsets_[REC_OFFS_HEADER_SIZE + MAX_REF_PARTS + 2]; + if (trx_id_offset) { +#ifdef UNIV_DEBUG + ut_ad(rec_offs_validate(NULL, index, offsets)); + if (buf_block_get_page_zip( + btr_pcur_get_block(&node->pcur))) { + /* Below, page_zip_write_trx_id_and_roll_ptr() + needs offsets to access DB_TRX_ID,DB_ROLL_PTR. + We already computed offsets for possibly + another record in the clustered index. + Because the PRIMARY KEY is fixed-length, + the offsets for the PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR are still valid. + Silence the rec_offs_validate() assertion. 
*/ + rec_offs_make_valid(rec, index, true, offsets); + } +#endif + } else if (rec_is_metadata(rec, *index)) { + ut_ad(!buf_block_get_page_zip(btr_pcur_get_block( + pcur))); + for (unsigned i = index->first_user_field(); i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else { + ut_ad(index->n_uniq <= MAX_REF_PARTS); + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + trx_id_pos + 2, &heap); + ulint len; + trx_id_offset = rec_get_nth_field_offs( + offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + } + + if (trx_read_trx_id(rec + trx_id_offset) == node->new_trx_id) { + ut_ad(!rec_get_deleted_flag( + rec, dict_table_is_comp(node->table)) + || rec_is_alter_metadata(rec, *index)); + index->set_modified(mtr); + buf_block_t* block = btr_pcur_get_block(pcur); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, trx_id_pos, + 0, 1ULL << ROLL_PTR_INSERT_FLAG_POS, + &mtr); + } else { + size_t offs = page_offset(rec + trx_id_offset); + mtr.memset(block, offs, DATA_TRX_ID_LEN, 0); + offs += DATA_TRX_ID_LEN; + mtr.write<1,mtr_t::MAYBE_NOP>(*block, + block->page.frame + + offs, 0x80U); + mtr.memset(block, offs + 1, + DATA_ROLL_PTR_LEN - 1, 0); + } + } + } else { + goto func_exit; + } + +mtr_commit_exit: + btr_pcur_commit_specify_mtr(pcur, &mtr); + +func_exit: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode) /*!< in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + mtr_t mtr_vers; + const bool modify_leaf = mode == BTR_MODIFY_LEAF; + + row_mtr_start(&mtr, index, !modify_leaf); + + pcur.btr_cur.page_cur.index = index; + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (index->is_spatial()) { + mode = modify_leaf + ? btr_latch_mode(BTR_MODIFY_LEAF + | BTR_RTREE_DELETE_MARK + | BTR_RTREE_UNDO_INS) + : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS); + btr_cur->thr = thr; + if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) { + goto found; + } else { + goto func_exit; + } + } else if (!index->is_committed()) { + /* The index->online_status may change if the index is + or was being created online, but not committed yet. It + is protected by index->lock. */ + if (modify_leaf) { + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + ut_ad(mode == BTR_PURGE_TREE); + mode = BTR_PURGE_TREE_ALREADY_LATCHED; + mtr_x_lock_index(index, &mtr); + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_COMPLETE if + index->is_committed(). */ + ut_ad(!dict_index_is_online_ddl(index)); + } + + switch (UNIV_EXPECT(row_search_index_entry(entry, mode, &pcur, &mtr), + ROW_FOUND)) { + case ROW_NOT_FOUND: + /* In crash recovery, the secondary index record may + be missing if the UPDATE did not have time to insert + the secondary index records before the crash. 
When we + are undoing that UPDATE in crash recovery, the record + may be missing. + + In normal processing, if an update ends in a deadlock + before it has inserted all updated secondary index + records, then the undo will not find those records. */ + goto func_exit; + case ROW_FOUND: + break; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + } + +found: + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should delete mark the record. */ + + mtr_vers.start(); + + ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) == + btr_pcur_t::SAME_ALL); + + /* For temporary table, we can skip to check older version of + clustered index entry, because there is no MVCC or purge. */ + if (node->table->is_temporary() + || row_vers_old_has_index_entry( + false, btr_pcur_get_rec(&node->pcur), + &mtr_vers, index, entry, 0, 0)) { + btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), &mtr); + } else { + /* Remove the index record */ + + if (dict_index_is_spatial(index)) { + rec_t* rec = btr_pcur_get_rec(&pcur); + if (rec_get_deleted_flag(rec, + dict_table_is_comp(index->table))) { + ib::error() << "Record found in index " + << index->name << " is deleted marked" + " on rollback update."; + ut_ad(0); + } + } + + if (modify_leaf) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr); + } else { + /* Passing rollback=false, + because we are deleting a secondary index record: + the distinction only matters when deleting a + record that contains externally stored columns. */ + ut_ad(!index->is_primary()); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + false, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.cc, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + dberr_t err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_PURGE_TREE); + return(err); +} + +/***********************************************************//** +Delete unmarks a secondary index entry which must be found. 
It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. +@retval DB_SUCCESS on success +@retval DB_FAIL if BTR_MODIFY_TREE should be tried +@retval DB_OUT_OF_FILE_SPACE when running out of tablespace +@retval DB_DUPLICATE_KEY if the value was missing + and an insert would lead to a duplicate exists */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + btr_latch_mode mode, /*!< in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry) /*!< in: index entry */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + upd_t* update; + dberr_t err = DB_SUCCESS; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + const ulint flags + = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG; + const auto orig_mode = mode; + + pcur.btr_cur.page_cur.index = index; + ut_ad(trx->id != 0); + + if (index->is_spatial()) { + /* FIXME: Currently we do a 2-pass search for the undo + due to avoid undel-mark a wrong rec in rolling back in + partial update. Later, we could log some info in + secondary index updates to avoid this. */ + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + ut_ad(!(mode & 8)); + mode = btr_latch_mode(mode | BTR_RTREE_DELETE_MARK); + } + +try_again: + row_mtr_start(&mtr, index, mode & 8); + + btr_cur->thr = thr; + + if (index->is_spatial()) { + if (!rtr_search(entry, mode, &pcur, &mtr)) { + goto found; + } + + if (mode != orig_mode && btr_cur->rtr_info->fd_del) { + mode = orig_mode; + btr_pcur_close(&pcur); + mtr.commit(); + goto try_again; + } + + goto not_found; + } + + switch (row_search_index_entry(entry, mode, &pcur, &mtr)) { + mem_heap_t* heap; + mem_heap_t* offsets_heap; + rec_offs* offsets; + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: +not_found: + if (btr_cur->up_match >= dict_index_get_n_unique(index) + || btr_cur->low_match >= dict_index_get_n_unique(index)) { + ib::warn() << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on rollback, and" + " a duplicate exists: " + << *entry + << " at: " << rec_index_print( + btr_cur_get_rec(btr_cur), index); + err = DB_DUPLICATE_KEY; + break; + } + + ib::warn() << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on rollback, trying to insert: " + << *entry + << " at: " << rec_index_print( + btr_cur_get_rec(btr_cur), index); + + /* Insert the missing record that we were trying to + delete-unmark. */ + big_rec_t* big_rec; + rec_t* insert_rec; + offsets = NULL; + offsets_heap = NULL; + + err = btr_cur_optimistic_insert( + flags, btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + ut_ad(!big_rec); + + if (err == DB_FAIL && mode == BTR_MODIFY_TREE) { + err = btr_cur_pessimistic_insert( + flags, btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + /* There are no off-page columns in + secondary indexes. 
*/ + ut_ad(!big_rec); + } + + if (err == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(btr_cur), + btr_cur_get_page_zip(btr_cur), + trx->id, &mtr); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + break; + case ROW_FOUND: +found: + btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), &mtr); + heap = mem_heap_create( + sizeof(upd_t) + + dtuple_get_n_fields(entry) * sizeof(upd_field_t)); + offsets_heap = NULL; + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), + index, nullptr, index->n_core_fields, ULINT_UNDEFINED, + &offsets_heap); + update = row_upd_build_sec_rec_difference_binary( + btr_cur_get_rec(btr_cur), index, offsets, entry, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode != BTR_MODIFY_TREE) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + /* TODO: pass offsets, not &offsets */ + err = btr_cur_optimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + default: + break; + } + } else { + err = btr_cur_pessimistic_update( + flags, btr_cur, &offsets, &offsets_heap, + heap, &dummy_big_rec, + update, 0, thr, thr_get_trx(thr)->id, &mtr); + ut_a(!dummy_big_rec); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_del_sec( +/*=====================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCk + should guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. 
*/ + ut_a(thr_get_trx(thr) == trx_roll_crash_recv_trx); + } else { + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + break; + } + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_del_mark_sec( +/*======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mem_heap_t* heap; + dberr_t err = DB_SUCCESS; + + ut_ad(!node->undo_row); + + heap = mem_heap_create(1024); + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + /* During online index creation, + HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_NOCOPY_NO_LOCK + should guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + index->type |= DICT_CORRUPT; + err = DB_SUCCESS; + /* Do not return any error to the caller. The + duplicate will be reported by ALTER TABLE or + CREATE UNIQUE INDEX. Unfortunately we cannot + report the duplicate key value to the DDL + thread, because the altered_table object is + private to its call stack. */ + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_mod_upd_exist_sec( +/*=======================*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + return DB_SUCCESS; + } + + mem_heap_t* heap = mem_heap_create(1024); + dberr_t err = DB_SUCCESS; + + do { + dict_index_t* index = node->index; + + if (index->type & (DICT_FTS | DICT_CORRUPT) + || !index->is_committed()) { + continue; + } + + if (!row_upd_changes_ord_field_binary_func( + index, node->update, +#ifdef UNIV_DEBUG + thr, +#endif /* UNIV_DEBUG */ + node->row, node->ext, ROW_BUILD_FOR_UNDO)) { + continue; + } + + /* Build the newest version of the index entry */ + dtuple_t* entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The server must have crashed in + row_upd_clust_rec_by_insert() before + the updated externally stored columns (BLOBs) + of the new clustered index entry were written. */ + + /* The table must be in DYNAMIC or COMPRESSED + format. 
REDUNDANT and COMPACT formats + store a local 768-byte prefix of each + externally stored column. */ + ut_a(dict_table_has_atomic_blobs(index->table)); + + /* This is only legitimate when + rolling back an incomplete transaction + after crash recovery. */ + ut_a(thr_get_trx(thr)->is_recovered); + + /* The server must have crashed before + completing the insert of the new + clustered index entry and before + inserting to the secondary indexes. + Because node->row was not yet written + to this index, we can ignore it. But + we must restore node->undo_row. */ + } else { + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the + original values because we do not know them. + But this should not cause problems because + in row0sel.cc, in queries we always retrieve + the clustered index record or an earlier + version of it, if the secondary index record + through which we do the search is + delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + if (err != DB_SUCCESS) { + break; + } + } + + mem_heap_empty(heap); + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. */ + entry = row_build_index_entry_low(node->undo_row, + node->undo_ext, + index, heap, + ROW_BUILD_FOR_UNDO); + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err == DB_DUPLICATE_KEY) { + index->type |= DICT_CORRUPT; + err = DB_SUCCESS; + } else if (err != DB_SUCCESS) { + break; + } + + mem_heap_empty(heap); + } while ((node->index = dict_table_get_next_index(node->index))); + + mem_heap_free(heap); + + return(err); +} + +/** Parse an update undo record. +@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked) +{ + dict_index_t* clust_index; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + byte info_bits; + byte type; + byte cmpl_info; + bool dummy_extern; + + ut_ad(node->trx->in_rollback); + ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + const byte *ptr = trx_undo_rec_get_pars( + node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + if (!node->is_temp) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + node->table = dict_sys.acquire_temporary_table(table_id); + dict_sys.unfreeze(); + } else { + node->table = dict_sys.acquire_temporary_table(table_id); + } + + if (!node->table) { + return false; + } + + ut_ad(!node->table->skip_alter_undo); + + if (UNIV_UNLIKELY(!node->table->is_accessible())) { +close_table: + /* Normally, tables should not disappear or become + unaccessible during ROLLBACK, because they should be + protected by InnoDB table locks. Corruption could be + a valid exception. 
+ + FIXME: When running out of temporary tablespace, it + would probably be better to just drop all temporary + tables (and temporary undo log records) of the current + connection, instead of doing this rollback. */ + dict_table_close(node->table, dict_locked); + node->table = NULL; + return false; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, + node->heap, &(node->update)); + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; + ut_ad(!node->ref->info_bits); + + if (node->update->info_bits & REC_INFO_MIN_REC_FLAG) { + if ((node->update->info_bits & ~REC_INFO_DELETED_FLAG) + != REC_INFO_MIN_REC_FLAG) { + ut_ad("wrong info_bits in undo log record" == 0); + goto close_table; + } + /* This must be an undo log record for a subsequent + instant ALTER TABLE, extending the metadata record. */ + ut_ad(clust_index->is_instant()); + ut_ad(clust_index->table->instant + || !(node->update->info_bits & REC_INFO_DELETED_FLAG)); + node->ref = &trx_undo_metadata; + node->update->info_bits = (node->update->info_bits + & REC_INFO_DELETED_FLAG) + ? REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; + } + + if (!row_undo_search_clust_to_pcur(node)) { + /* As long as this rolling-back transaction exists, + the PRIMARY KEY value pointed to by the undo log + record should exist. + + However, if InnoDB is killed during a rollback, or + shut down during the rollback of recovered + transactions, then after restart we may try to roll + back some of the same undo log records again, because + trx_roll_try_truncate() is not being invoked after + every undo log record. + + It is also possible that the record + was not modified yet (the DB_ROLL_PTR does not match + node->roll_ptr) and thus there is nothing to roll back. + + btr_cur_upd_lock_and_undo() only writes the undo log + record after successfully acquiring an exclusive lock + on the the clustered index record. That lock will not + be released before the transaction is committed or + fully rolled back. (Exception: if the server was + killed, restarted, and shut down again before the + rollback of the recovered transaction was completed, + it is possible that the transaction was partially + rolled back and locks released.) */ + goto close_table; + } + + /* Extract indexed virtual columns from undo log */ + if (node->ref != &trx_undo_metadata && node->table->n_v_cols) { + row_upd_replace_vcol(node->row, node->table, + node->update, false, node->undo_row, + (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? nullptr : ptr); + } + + return true; +} + +/***********************************************************//** +Undoes a modify operation on a row of a table. 
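[Editor's sketch] row_undo_mod() below dispatches on the three update-undo record types, and each type implies a different treatment of the secondary indexes before the clustered index record is rolled back. A compact sketch of that dispatch; the enumerators paraphrase TRX_UNDO_UPD_EXIST_REC, TRX_UNDO_DEL_MARK_REC and TRX_UNDO_UPD_DEL_REC and are not the real codes.

// Illustrative only, not InnoDB code.
#include <string_view>

enum class UpdateUndoType {
    UPDATE_OF_EXISTING_RECORD,   // like TRX_UNDO_UPD_EXIST_REC
    DELETE_MARKING,              // like TRX_UNDO_DEL_MARK_REC
    UPDATE_OF_DELETE_MARKED      // like TRX_UNDO_UPD_DEL_REC
};

std::string_view secondary_index_action(UpdateUndoType type) {
    switch (type) {
    case UpdateUndoType::UPDATE_OF_EXISTING_RECORD:
        // The newer secondary entries are delete-marked or removed and the
        // previous entries are delete-unmarked / restored.
        return "roll back changed secondary entries, restore old ones";
    case UpdateUndoType::DELETE_MARKING:
        // The row still exists; the delete marks are simply removed again.
        return "delete-unmark the secondary entries";
    case UpdateUndoType::UPDATE_OF_DELETE_MARKED:
        // The row is being re-deleted; entries are delete-marked or removed.
        return "delete-mark or remove the secondary entries";
    }
    return "unknown";
}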
+@return DB_SUCCESS or error code */ +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + ut_ad(thr_get_trx(thr) == node->trx); + const bool dict_locked = node->trx->dict_operation_lock_mode; + + if (!row_undo_mod_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; + } + + ut_ad(node->table->is_temporary() + || lock_table_has_locks(node->table)); + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (node->ref->info_bits) { + ut_ad(node->ref->is_metadata()); + goto rollback_clust; + } + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + if (node->index) { + switch (node->rec_type) { + case TRX_UNDO_UPD_EXIST_REC: + err = row_undo_mod_upd_exist_sec(node, thr); + break; + case TRX_UNDO_DEL_MARK_REC: + err = row_undo_mod_del_mark_sec(node, thr); + break; + case TRX_UNDO_UPD_DEL_REC: + err = row_undo_mod_upd_del_sec(node, thr); + break; + default: + MY_ASSERT_UNREACHABLE(); + } + } + + if (err == DB_SUCCESS) { +rollback_clust: + err = row_undo_mod_clust(node, thr); + + bool update_statistics + = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); + + if (err == DB_SUCCESS && node->table->stat_initialized) { + switch (node->rec_type) { + case TRX_UNDO_UPD_EXIST_REC: + break; + case TRX_UNDO_DEL_MARK_REC: + dict_table_n_rows_inc(node->table); + update_statistics = update_statistics + || !srv_stats_include_delete_marked; + break; + case TRX_UNDO_UPD_DEL_REC: + dict_table_n_rows_dec(node->table); + update_statistics = update_statistics + || !srv_stats_include_delete_marked; + break; + } + + /* Do not attempt to update statistics when + executing ROLLBACK in the InnoDB SQL + interpreter, because in that case we would + already be holding dict_sys.latch, which + would be acquired when updating statistics. */ + if (update_statistics && !dict_locked) { + dict_stats_update_if_needed(node->table, + *node->trx); + } else { + node->table->stat_modified_counter++; + } + } + } + + dict_table_close(node->table, dict_locked); + + node->table = NULL; + + return(err); +} diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc new file mode 100644 index 00000000..8a1041c8 --- /dev/null +++ b/storage/innobase/row/row0undo.cc @@ -0,0 +1,453 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0undo.cc +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#include "row0undo.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0uins.h" +#include "row0umod.h" +#include "row0upd.h" +#include "row0mysql.h" +#include "srv0srv.h" +#include "srv0start.h" + +/* How to undo row operations? +(1) For an insert, we have stored a prefix of the clustered index record +in the undo log. Using it, we look for the clustered record, and using +that we look for the records in the secondary indexes. The insert operation +may have been left incomplete, if the database crashed, for example. +We may have look at the trx id and roll ptr to make sure the record in the +clustered index is really the one for which the undo log record was +written. We can use the framework we get from the original insert op. +(2) Delete marking: We can use the framework we get from the original +delete mark op. We only have to check the trx id. +(3) Update: This may be the most complicated. We have to use the framework +we get from the original update op. + +What if the same trx repeatedly deletes and inserts an identical row. +Then the row id changes and also roll ptr. What if the row id was not +part of the ordering fields in the clustered index? Maybe we have to write +it to undo log. Well, maybe not, because if we order the row id and trx id +in descending order, then the only undeleted copy is the first in the +index. Our searches in row operations always position the cursor before +the first record in the result set. But, if there is no key defined for +a table, then it would be desirable that row id is in ascending order. +So, lets store row id in descending order only if it is not an ordering +field in the clustered index. + +NOTE: Deletes and inserts may lead to situation where there are identical +records in a secondary index. Is that a problem in the B-tree? Yes. +Also updates can lead to this, unless trx id and roll ptr are included in +ord fields. +(1) Fix in clustered indexes: include row id, trx id, and roll ptr +in node pointers of B-tree. +(2) Fix in secondary indexes: include all fields in node pointers, and +if an entry is inserted, check if it is equal to the right neighbor, +in which case update the right neighbor: the neighbor must be delete +marked, set it unmarked and write the trx id of the current transaction. + +What if the same trx repeatedly updates the same row, updating a secondary +index field or not? Updating a clustered index ordering field? + +(1) If it does not update the secondary index and not the clustered index +ord field. Then the secondary index record stays unchanged, but the +trx id in the secondary index record may be smaller than in the clustered +index record. This is no problem? +(2) If it updates secondary index ord field but not clustered: then in +secondary index there are delete marked records, which differ in an +ord field. No problem. 
+(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) + || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) + || trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(parent); + + undo = static_cast<undo_node_t*>( + mem_heap_alloc(heap, sizeof(undo_node_t))); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. 
+@return true if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +bool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in/out: row undo node */ +{ + dict_index_t* clust_index; + bool found; + mtr_t mtr; + row_ext_t** ext; + const rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!node->table->skip_alter_undo); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&node->pcur, BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + if (!found) { + goto func_exit; + } + + rec = btr_pcur_get_rec(&node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + found = row_get_rec_roll_ptr(rec, clust_index, offsets) + == node->roll_ptr; + + if (found) { + ut_ad(row_get_rec_trx_id(rec, clust_index, offsets) + == node->trx->id || node->table->is_temporary()); + + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored + columns in the clustered index record. Build a + cache of column prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored + column. No cache is needed. */ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, NULL, + NULL, NULL, ext, node->heap); + + /* We will need to parse out virtual column info from undo + log, first mark them DATA_MISSING. So we will know if the + value gets updated */ + if (node->table->n_v_cols + && !trx_undo_roll_ptr_is_insert(node->roll_ptr) + && !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + for (ulint i = 0; + i < dict_table_get_n_v_cols(node->table); i++) { + dfield_get_type(dtuple_get_nth_v_field( + node->row, i))->mtype = DATA_MISSING; + } + } + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + ut_ad((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG + || node->row->info_bits == 0); + node->undo_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->undo_row, &node->undo_ext, + clust_index, node->update, node->heap); + } else { + ut_ad(((node->row->info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); + node->undo_row = NULL; + node->undo_ext = NULL; + } + + btr_pcur_store_position(&node->pcur, &mtr); + } + + if (heap) { + mem_heap_free(heap); + } + +func_exit: + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + return(found); +} + +/** Get the latest undo log record for rollback. 
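[Editor's sketch] row_undo_rec_get() below has to interleave two undo logs, the redo-logged one and the temporary-table one, always rolling back the record with the highest undo number first and stopping at trx->roll_limit for a partial rollback. A small standalone sketch of that selection, using plain integers for the undo numbers; UndoLogTop is a stand-in for the trx_undo_t bookkeeping.

// Illustrative only, not InnoDB code.
#include <cstdint>
#include <optional>

struct UndoLogTop {
    bool          empty;
    std::uint64_t top_undo_no;   // undo number of the latest record in this log
};

// Returns 0 = nothing left to undo, 1 = take the next record from the
// redo-logged undo log, 2 = take it from the temporary (no-redo) undo log.
int pick_next_undo_log(const std::optional<UndoLogTop>& redo,
                       const std::optional<UndoLogTop>& temp,
                       std::uint64_t roll_limit)
{
    int           pick = 0;
    std::uint64_t best = 0;

    if (redo && !redo->empty && redo->top_undo_no >= roll_limit) {
        pick = 1;
        best = redo->top_undo_no;
    }
    if (temp && !temp->empty && temp->top_undo_no >= roll_limit
        && (pick == 0 || temp->top_undo_no > best)) {
        pick = 2;
    }
    return pick;   // records are undone in descending undo number order
}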
+@param[in,out] node rollback context +@return undo block for the undo log record +@retval nullptr if no undo log record was fetched */ +static buf_block_t* row_undo_rec_get(undo_node_t* node) +{ + trx_t* trx = node->trx; + + if (trx->pages_undone) { + trx->pages_undone = 0; + trx_undo_try_truncate(*trx); + } + + trx_undo_t* undo = NULL; + trx_undo_t* update = trx->rsegs.m_redo.undo; + trx_undo_t* temp = trx->rsegs.m_noredo.undo; + const undo_no_t limit = trx->roll_limit; + node->is_temp = false; + + ut_ad(!update || !temp || update->empty() || temp->empty() + || update->top_undo_no != temp->top_undo_no); + + if (update && !update->empty() && update->top_undo_no >= limit) { + if (!undo) { + undo = update; + } else if (undo->top_undo_no < update->top_undo_no) { + undo = update; + } + } + + if (temp && !temp->empty() && temp->top_undo_no >= limit) { + if (!undo || undo->top_undo_no < temp->top_undo_no) { + undo = temp; + node->is_temp = true; + } + } + + if (undo == NULL) { + trx_undo_try_truncate(*trx); + /* Mark any ROLLBACK TO SAVEPOINT completed, so that + if the transaction object is committed and reused + later, we will default to a full ROLLBACK. */ + trx->roll_limit = 0; + trx->in_rollback = false; + return nullptr; + } + + ut_ad(!undo->empty()); + ut_ad(limit <= undo->top_undo_no); + + node->roll_ptr = trx_undo_build_roll_ptr( + false, trx_sys.rseg_id(undo->rseg, !node->is_temp), + undo->top_page_no, undo->top_offset); + + mtr_t mtr; + mtr.start(); + + buf_block_t* undo_page = buf_page_get( + page_id_t(undo->rseg->space->id, undo->top_page_no), + 0, RW_S_LATCH, &mtr); + if (!undo_page) { + return nullptr; + } + + uint16_t offset = undo->top_offset; + + buf_block_t* prev_page = undo_page; + if (trx_undo_rec_t* prev_rec = trx_undo_get_prev_rec( + prev_page, offset, undo->hdr_page_no, undo->hdr_offset, + true, &mtr)) { + if (prev_page != undo_page) { + trx->pages_undone++; + } + + undo->top_page_no = prev_page->page.id().page_no(); + undo->top_offset = page_offset(prev_rec); + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + ut_ad(!undo->empty()); + } else { + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); + } + + undo_page->fix(); + mtr.commit(); + + node->undo_rec = undo_page->page.frame + offset; + + const size_t end = mach_read_from_2(node->undo_rec); + if (UNIV_UNLIKELY(end <= offset + || end >= srv_page_size - FIL_PAGE_DATA_END)) { + undo_page->unfix(); + node->undo_rec = nullptr; + return nullptr; + } + + switch (node->undo_rec[2] & (TRX_UNDO_CMPL_INFO_MULT - 1)) { + case TRX_UNDO_INSERT_METADATA: + /* This record type was introduced in MDEV-11369 + instant ADD COLUMN, which was implemented after + MDEV-12288 removed the insert_undo log. There is no + instant ADD COLUMN for temporary tables. Therefore, + this record can only be present in the main undo log. */ + /* fall through */ + case TRX_UNDO_RENAME_TABLE: + ut_ad(undo == update); + /* fall through */ + case TRX_UNDO_INSERT_REC: + case TRX_UNDO_EMPTY: + node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS; + } + + trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no( + node->undo_rec); + return undo_page; +} + +/***********************************************************//** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. 
+@return DB_SUCCESS if operation successfully completed, else error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +row_undo( +/*=====*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(node->trx->in_rollback); + + buf_block_t* undo_page = row_undo_rec_get(node); + + if (!undo_page) { + /* Rollback completed for this query thread */ + thr->run_node = que_node_get_parent(node); + return DB_SUCCESS; + } + + dberr_t err = trx_undo_roll_ptr_is_insert(node->roll_ptr) + ? row_undo_ins(node, thr) : row_undo_mod(node, thr); + undo_page->unfix(); + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + undo_node_t* node; + trx_t* trx = thr_get_trx(thr); + + node = static_cast<undo_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + if (UNIV_UNLIKELY(!trx->dict_operation + && !srv_undo_sources + && srv_shutdown_state != SRV_SHUTDOWN_NONE) + && (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) { + /* Shutdown has been initiated. */ + trx->error_state = DB_INTERRUPTED; + return NULL; + } + + if (UNIV_UNLIKELY(trx == trx_roll_crash_recv_trx)) { + trx_roll_report_progress(); + } + + err = row_undo(node, thr); + +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) { + DEBUG_SYNC_C("trx_after_rollback_row"); + } +#endif /* ENABLED_DEBUG_SYNC */ + + trx->error_state = err; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::fatal() << "Error (" << err << ") in rollback."; + } + + return(thr); +} diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc new file mode 100644 index 00000000..bec53841 --- /dev/null +++ b/storage/innobase/row/row0upd.cc @@ -0,0 +1,3002 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0upd.cc +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "trx0undo.h" +#include "rem0rec.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "mach0data.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0ins.h" +#include "row0log.h" +#include "row0row.h" +#include "row0sel.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "pars0sym.h" +#include "eval0eval.h" +#include "buf0lru.h" +#include "trx0rec.h" +#include "fts0fts.h" +#include "fts0types.h" +#include <algorithm> +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> +#ifdef WITH_WSREP +#include "log.h" +#include "wsrep.h" +#endif /* WITH_WSREP */ + + +/* What kind of latch and lock can we assume when the control comes to + ------------------------------------------------------------------- +an update node? +-------------- +Efficiency of massive updates would require keeping an x-latch on a +clustered index page through many updates, and not setting an explicit +x-lock on clustered index records, as they anyway will get an implicit +x-lock when they are updated. A problem is that the read nodes in the +graph should know that they must keep the latch when passing the control +up to the update node, and not set any record lock on the record which +will be updated. Another problem occurs if the execution is stopped, +as the kernel switches to another query thread, or the transaction must +wait for a lock. Then we should be able to release the latch and, maybe, +acquire an explicit x-lock on the record. + Because this seems too complicated, we conclude that the less +efficient solution of releasing all the latches when the control is +transferred to another node, and acquiring explicit x-locks, is better. */ + +/* How is a delete performed? If there is a delete without an +explicit cursor, i.e., a searched delete, there are at least +two different situations: +the implicit select cursor may run on (1) the clustered index or +on (2) a secondary index. The delete is performed by setting +the delete bit in the record and substituting the id of the +deleting transaction for the original trx id, and substituting a +new roll ptr for previous roll ptr. The old trx id and roll ptr +are saved in the undo log record. Thus, no physical changes occur +in the index tree structure at the time of the delete. Only +when the undo log is purged, the index records will be physically +deleted from the index trees. + +The query graph executing a searched delete would consist of +a delete node which has as a subtree a select subgraph. +The select subgraph should return a (persistent) cursor +in the clustered index, placed on page which is x-latched. +The delete node should look for all secondary index records for +this clustered index entry and mark them as deleted. When is +the x-latch freed? The most efficient way for performing a +searched delete is obviously to keep the x-latch for several +steps of query graph execution. 
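+
+To make the above concrete, a rough outline of such a searched delete
+(an illustrative sketch only, not actual code; the delete-marking of
+secondary index entries is performed by row_upd_sec_index_entry()
+below):
+
+	position a persistent cursor on the clustered index record;
+	write an undo log record saving the old DB_TRX_ID and DB_ROLL_PTR;
+	delete-mark the clustered index record and stamp it with the
+	deleting transaction id and the new roll ptr;
+	for each secondary index, build the old index entry from the row
+	and delete-mark the matching secondary index record;
+	the records are physically removed from the index trees only
+	later, by purge.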
*/ + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. +@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: old value of index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n); /*!< in: how many first fields to check */ + +/*********************************************************************//** +Checks if index currently is mentioned as a referenced index in a foreign +key constraint. + +@return true if referenced */ +static +bool +row_upd_index_is_referenced( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t *table= index->table; + /* The pointers in table->referenced_set are safe to dereference + thanks to the SQL layer having acquired MDL on all (grand)parent tables. */ + dict_foreign_set::iterator end= table->referenced_set.end(); + return end != std::find_if(table->referenced_set.begin(), end, + dict_foreign_with_index(index)); +} + +#ifdef WITH_WSREP +static +bool +wsrep_row_upd_index_is_foreign( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + if (!trx->is_wsrep()) + return false; + + dict_table_t *table= index->table; + + if (table->foreign_set.empty()) + return false; + + /* No MDL protects dereferencing the members of table->foreign_set. */ + const bool no_lock= !trx->dict_operation_lock_mode; + if (no_lock) + dict_sys.freeze(SRW_LOCK_CALL); + + auto end= table->foreign_set.end(); + const bool is_referenced= end != + std::find_if(table->foreign_set.begin(), end, + [index](const dict_foreign_t* f) + {return f->foreign_index == index;}); + if (no_lock) + dict_sys.unfreeze(); + + return is_referenced; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if possible foreign key constraints hold after a delete of the record +under pcur. + +NOTE that this function will temporarily commit mtr and lose the +pcur position! + +@return DB_SUCCESS or an error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_check_references_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! 
*/ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + const rec_t* rec; + dberr_t err; + + DBUG_ENTER("row_upd_check_references_constraints"); + + if (table->referenced_set.empty()) { + DBUG_RETURN(DB_SUCCESS); + } + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, heap); + + mtr_commit(mtr); + + DEBUG_SYNC_C("foreign_constraint_check_for_update"); + + mtr->start(); + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_insert"); + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. */ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + dict_table_t* ref_table = nullptr; + + if (!foreign->foreign_table) { + ref_table = dict_table_open_on_name( + foreign->foreign_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + } + + err = row_ins_check_foreign_constraint( + FALSE, foreign, table, entry, thr); + + if (ref_table) { + dict_table_close(ref_table); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + + err = DB_SUCCESS; + +func_exit: + mem_heap_free(heap); + + DEBUG_SYNC_C("foreign_constraint_check_for_update_done"); + DBUG_RETURN(err); +} + +#ifdef WITH_WSREP +static +dberr_t +wsrep_row_upd_check_foreign_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + const rec_t* rec; + dberr_t err; + + if (table->foreign_set.empty()) { + return(DB_SUCCESS); + } + + /* TODO: make native slave thread bail out here */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
*/ + + if (foreign->foreign_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + dict_table_t *opened = nullptr; + + if (!foreign->referenced_table) { + foreign->referenced_table = + dict_table_open_on_name( + foreign->referenced_table_name_lookup, + false, DICT_ERR_IGNORE_NONE); + opened = foreign->referenced_table; + } + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (opened) { + dict_table_close(opened); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + } + } + + err = DB_SUCCESS; +func_exit: + mem_heap_free(heap); + + return(err); +} + +/** Determine if a FOREIGN KEY constraint needs to be processed. +@param[in] node query node +@param[in] trx transaction +@return whether the node cannot be ignored */ + +inline bool wsrep_must_process_fk(const upd_node_t* node, const trx_t* trx) +{ + if (!trx->is_wsrep()) { + return false; + } + return que_node_get_type(node->common.parent) != QUE_NODE_UPDATE + || static_cast<upd_node_t*>(node->common.parent)->cascade_node + != node; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + upd_node_t* node; + + node = static_cast<upd_node_t*>( + mem_heap_zalloc(heap, sizeof(upd_node_t))); + + node->common.type = QUE_NODE_UPDATE; + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + return(node); +} + +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + /* We should ignore virtual field if the index is not + a virtual index */ + if (upd_fld_is_virtual_col(upd_field) + && !index->has_virtual()) { + continue; + } + + new_val = &(upd_field->new_val); + if (dfield_is_ext(new_val)) { + return(TRUE); + } + new_len = dfield_get_len(new_val); + ut_ad(new_len != UNIV_SQL_DEFAULT); + + if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { + new_len = dict_col_get_sql_null_size( + dict_index_get_nth_col(index, + upd_field->field_no), + 0); + } + + if (rec_offs_nth_default(offsets, upd_field->field_no)) { + /* This is an instantly added column that is + at the initial default value. 
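+			The value is not physically stored in the
+			record at all, so the update cannot be
+			applied in place; report a change so that
+			the record will be rebuilt with the column
+			materialized.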
*/ + return(TRUE); + } + + if (rec_offs_comp(offsets) + && rec_offs_nth_sql_null(offsets, upd_field->field_no)) { + /* Note that in the compact table format, for a + variable length field, an SQL NULL will use zero + bytes in the offset array at the start of the physical + record, but a zero-length value (empty string) will + use one byte! Thus, we cannot use update-in-place + if we update an SQL NULL varchar to an empty string! */ + + old_len = UNIV_SQL_NULL; + } else { + old_len = rec_offs_nth_size(offsets, + upd_field->field_no); + } + + if (old_len != new_len + || rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + + /* This function is used only for a secondary index */ + ut_a(!dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); + ut_ad(!rec_offs_any_extern(offsets)); + ut_ad(!rec_offs_any_default(offsets)); + ut_ad(!index->table->skip_alter_undo); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + + +/** Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! 
+@param[in] index clustered index +@param[in] entry clustered index entry to insert +@param[in] rec clustered index record +@param[in] offsets rec_get_offsets(rec,index), or NULL +@param[in] no_sys skip the system columns + DB_TRX_ID and DB_ROLL_PTR +@param[in] trx transaction (for diagnostics), + or NULL +@param[in] heap memory heap from which allocated +@param[in] mysql_table NULL, or mysql table object when + user thread invokes dml +@param[out] error error number in case of failure +@return own: update vector of differing fields, excluding roll ptr and +trx id,if error is not equal to DB_SUCCESS, return NULL */ +upd_t* +row_upd_build_difference_binary( + dict_index_t* index, + const dtuple_t* entry, + const rec_t* rec, + const rec_offs* offsets, + bool no_sys, + bool ignore_warnings, + trx_t* trx, + mem_heap_t* heap, + TABLE* mysql_table, + dberr_t* error) +{ + ulint len; + upd_t* update; + ulint n_diff; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint n_v_fld = dtuple_get_n_v_fields(entry); + rec_offs_init(offsets_); + + /* This function is used only for a clustered index */ + ut_a(dict_index_is_clust(index)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(entry->n_fields <= index->n_fields); + ut_ad(entry->n_fields >= index->n_core_fields); + + update = upd_create(index->n_fields + n_v_fld, heap); + + n_diff = 0; + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + for (uint16_t i = 0; i < entry->n_fields; i++) { + const byte* data = rec_get_nth_cfield(rec, index, offsets, i, + &len); + const dfield_t* dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + if (no_sys && (i == index->db_trx_id() + || i == index->db_roll_ptr())) { + continue; + } + + if (!dfield_is_ext(dfield) + != !rec_offs_nth_extern(offsets, i) + || !dfield_data_is_binary_equal(dfield, len, data)) { + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + dfield_copy(&uf->new_val, dfield); + upd_field_set_field_no(uf, i, index); + } + } + + for (uint16_t i = static_cast<uint16_t>(entry->n_fields); + i < index->n_fields; i++) { + upd_field_t* uf = upd_get_nth_field(update, n_diff++); + const dict_col_t* col = dict_index_get_nth_col(index, i); + /* upd_create() zero-initialized uf */ + uf->new_val.data = const_cast<byte*>(col->instant_value(&len)); + uf->new_val.len = static_cast<unsigned>(len); + dict_col_copy_type(col, &uf->new_val.type); + upd_field_set_field_no(uf, i, index); + } + + /* Check the virtual columns updates. 
Even if there is no non-virtual + column (base columns) change, we will still need to build the + indexed virtual column value so that undo log would log them ( + for purge/mvcc purpose) */ + if (n_v_fld > 0) { + row_ext_t* ext; + THD* thd; + + if (trx == NULL) { + thd = current_thd; + } else { + thd = trx->mysql_thd; + } + + ut_ad(!update->old_vrow); + + ib_vcol_row vc(NULL); + uchar *record = vc.record(thd, index, &mysql_table); + + for (uint16_t i = 0; i < n_v_fld; i++) { + const dict_v_col_t* col + = dict_table_get_nth_v_col(index->table, i); + + if (!col->m_col.ord_part) { + continue; + } + + if (update->old_vrow == NULL) { + update->old_vrow = row_build( + ROW_COPY_POINTERS, index, rec, offsets, + index->table, NULL, NULL, &ext, heap); + } + + dfield_t* vfield = innobase_get_computed_value( + update->old_vrow, col, index, + &vc.heap, heap, NULL, thd, mysql_table, record, + NULL, NULL, ignore_warnings); + if (vfield == NULL) { + *error = DB_COMPUTE_VALUE_FAILED; + return(NULL); + } + + const dfield_t* dfield = dtuple_get_nth_v_field( + entry, i); + + if (!dfield_data_is_binary_equal( + dfield, vfield->len, + static_cast<byte*>(vfield->data))) { + upd_field_t* uf = upd_get_nth_field(update, + n_diff++); + uf->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc(heap, + sizeof *uf->old_v_val)); + dfield_copy(uf->old_v_val, vfield); + dfield_copy(&uf->new_val, dfield); + upd_field_set_v_field_no(uf, i, index); + } + } + } + + update->n_fields = n_diff; + ut_ad(update->validate()); + + return(update); +} + +/** Fetch a prefix of an externally stored column. +This is similar to row_ext_lookup(), but the row_ext_t holds the old values +of the column and must not be poisoned with the new values. +@param[in] data 'internally' stored part of the field +containing also the reference to the external part +@param[in] local_len length of data, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] len input - length of prefix to +fetch; output: fetched length of the prefix +@param[in,out] heap heap where to allocate +@return BLOB prefix +@retval NULL if the record is incomplete (should only happen +in row_vers_vc_matches_cluster() executed concurrently with another purge) */ +static +byte* +row_upd_ext_fetch( + const byte* data, + ulint local_len, + ulint zip_size, + ulint* len, + mem_heap_t* heap) +{ + byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len)); + + *len = btr_copy_externally_stored_field_prefix( + buf, *len, zip_size, data, local_len); + + return *len ? buf : NULL; +} + +/** Replaces the new column value stored in the update vector in +the given index entry field. 
+@param[in,out] dfield data field of the index entry +@param[in] field index field +@param[in] col field->col +@param[in] uf update field +@param[in,out] heap memory heap for allocating and copying +the new value +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether the previous version was built successfully */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +row_upd_index_replace_new_col_val( + dfield_t* dfield, + const dict_field_t* field, + const dict_col_t* col, + const upd_field_t* uf, + mem_heap_t* heap, + ulint zip_size) +{ + ulint len; + const byte* data; + + dfield_copy_data(dfield, &uf->new_val); + + if (dfield_is_null(dfield)) { + return true; + } + + len = dfield_get_len(dfield); + data = static_cast<const byte*>(dfield_get_data(dfield)); + + if (field->prefix_len > 0) { + ibool fetch_ext = dfield_is_ext(dfield) + && len < (ulint) field->prefix_len + + BTR_EXTERN_FIELD_REF_SIZE; + + if (fetch_ext) { + ulint l = len; + + len = field->prefix_len; + + data = row_upd_ext_fetch(data, l, zip_size, + &len, heap); + if (UNIV_UNLIKELY(!data)) { + return false; + } + } + + len = dtype_get_at_most_n_mbchars(col->prtype, + col->mbminlen, col->mbmaxlen, + field->prefix_len, len, + (const char*) data); + + dfield_set_data(dfield, data, len); + + if (!fetch_ext) { + dfield_dup(dfield, heap); + } + + return true; + } + + switch (uf->orig_len) { + byte* buf; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(dfield, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(dfield); + /* fall through */ + case 0: + dfield_dup(dfield, heap); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len)); + + /* Copy the locally stored prefix. */ + memcpy(buf, data, + unsigned(uf->orig_len) - BTR_EXTERN_FIELD_REF_SIZE); + + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(dfield, buf, uf->orig_len); + dfield_set_ext(dfield); + break; + } + + return true; +} + +/** Apply an update vector to an metadata entry. 
+@param[in,out] entry clustered index metadata record to be updated +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +static +void +row_upd_index_replace_metadata( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) +{ + ut_ad(!index->table->skip_alter_undo); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->info_bits == update->info_bits); + ut_ad(entry->n_fields == ulint(index->n_fields) + 1); + const ulint zip_size = index->table->space->zip_size(); + const ulint first = index->first_user_field(); + ut_d(bool found_mblob = false); + + for (ulint i = upd_get_n_fields(update); i--; ) { + const upd_field_t* uf = upd_get_nth_field(update, i); + ut_ad(!upd_fld_is_virtual_col(uf)); + ut_ad(uf->field_no >= first - 2); + ulint f = uf->field_no; + dfield_t* dfield = dtuple_get_nth_field(entry, f); + + if (f == first) { + ut_d(found_mblob = true); + ut_ad(!dfield_is_null(&uf->new_val)); + ut_ad(dfield_is_ext(dfield)); + ut_ad(dfield_get_len(dfield) == FIELD_REF_SIZE); + ut_ad(!dfield_is_null(dfield)); + dfield_set_data(dfield, uf->new_val.data, + uf->new_val.len); + if (dfield_is_ext(&uf->new_val)) { + dfield_set_ext(dfield); + } + continue; + } + + f -= f > first; + const dict_field_t* field = dict_index_get_nth_field(index, f); + if (!row_upd_index_replace_new_col_val(dfield, field, + field->col, + uf, heap, zip_size)) { + ut_error; + } + } + + ut_ad(found_mblob); +} + +/** Apply an update vector to an index entry. +@param[in,out] entry index entry to be updated; the clustered index record + must be covered by a lock or a page latch to prevent + deletion (rollback or purge) +@param[in] index index of the entry +@param[in] update update vector built for the entry +@param[in,out] heap memory heap for copying off-page columns */ +void +row_upd_index_replace_new_col_vals_index_pos( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + mem_heap_t* heap) +{ + ut_ad(!index->table->skip_alter_undo); + ut_ad(!entry->is_metadata() || entry->info_bits == update->info_bits); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + row_upd_index_replace_metadata(entry, index, update, heap); + return; + } + + const ulint zip_size = index->table->space->zip_size(); + + dtuple_set_info_bits(entry, update->info_bits); + + for (uint16_t i = index->n_fields; i--; ) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + if (col->is_virtual()) { + const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t*>( + col); + + uf = upd_get_field_by_field_no( + update, vcol->v_pos, true); + } else { + uf = upd_get_field_by_field_no( + update, i, false); + } + + if (uf && UNIV_UNLIKELY(!row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, + zip_size))) { + ut_error; + } + } +} + +/** Replace the new column values stored in the update vector, +during trx_undo_prev_version_build(). 
+@param entry clustered index tuple where the values are replaced + (the clustered index leaf page latch must be held) +@param index clustered index +@param update update vector for the clustered index +@param heap memory heap for allocating and copying values +@return whether the previous version was built successfully */ +bool +row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index, + const upd_t *update, mem_heap_t *heap) +{ + ut_ad(index.is_primary()); + const ulint zip_size= index.table->space->zip_size(); + + ut_ad(!index.table->skip_alter_undo); + dtuple_set_info_bits(entry, update->info_bits); + + for (ulint i= 0; i < index.n_fields; i++) + { + const dict_field_t *field= &index.fields[i]; + const dict_col_t* col= dict_field_get_col(field); + const upd_field_t *uf; + + if (col->is_virtual()) + { + const dict_v_col_t *vcol= reinterpret_cast<const dict_v_col_t*>(col); + uf= upd_get_field_by_field_no(update, vcol->v_pos, true); + } + else + uf= upd_get_field_by_field_no(update, static_cast<uint16_t> + (dict_col_get_clust_pos(col, &index)), + false); + + if (!uf) + continue; + + if (!row_upd_index_replace_new_col_val(dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size)) + return false; + } + + return true; +} + +/** Replaces the virtual column values stored in the update vector. +@param[in,out] row row whose column to be set +@param[in] field data to set +@param[in] len data length +@param[in] vcol virtual column info */ +static +void +row_upd_set_vcol_data( + dtuple_t* row, + const byte* field, + ulint len, + dict_v_col_t* vcol) +{ + dfield_t* dfield = dtuple_get_nth_v_field(row, vcol->v_pos); + + if (dfield_get_type(dfield)->mtype == DATA_MISSING) { + dict_col_copy_type(&vcol->m_col, dfield_get_type(dfield)); + + dfield_set_data(dfield, field, len); + } +} + +/** Replaces the virtual column values stored in a dtuple with that of +a update vector. 
+@param[in,out] row row whose column to be updated +@param[in] table table +@param[in] update an update vector built for the clustered index +@param[in] upd_new update to new or old value +@param[in,out] undo_row undo row (if needs to be updated) +@param[in] ptr remaining part in update undo log */ +void +row_upd_replace_vcol( + dtuple_t* row, + const dict_table_t* table, + const upd_t* update, + bool upd_new, + dtuple_t* undo_row, + const byte* ptr) +{ + ulint col_no; + ulint i; + ulint n_cols; + + ut_ad(!table->skip_alter_undo); + + n_cols = dtuple_get_n_v_fields(row); + for (col_no = 0; col_no < n_cols; col_no++) { + dfield_t* dfield; + + const dict_v_col_t* col + = dict_table_get_nth_v_col(table, col_no); + + /* If there is no index on the column, do not bother for + value update */ + if (!col->m_col.ord_part) { + continue; + } + + dfield = dtuple_get_nth_v_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + if (!upd_fld_is_virtual_col(upd_field) + || upd_field->field_no != col->v_pos) { + continue; + } + + if (upd_new) { + dfield_copy_data(dfield, &upd_field->new_val); + } else { + dfield_copy_data(dfield, upd_field->old_v_val); + } + + dfield->type = upd_field->new_val.type; + break; + } + } + + bool first_v_col = true; + bool is_undo_log = true; + + /* We will read those unchanged (but indexed) virtual columns in */ + if (ptr) { + const byte* const end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + + while (ptr != end_ptr) { + const byte* field; + uint32_t field_no, len, orig_len; + + field_no = mach_read_next_compressed(&ptr); + + const bool is_v = (field_no >= REC_MAX_N_FIELDS); + + if (is_v) { + ptr = trx_undo_read_v_idx( + table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col = false; + } + + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + + if (field_no == FIL_NULL) { + ut_ad(is_v); + continue; + } + + if (is_v) { + dict_v_col_t* vcol = dict_table_get_nth_v_col( + table, field_no); + + row_upd_set_vcol_data(row, field, len, vcol); + + if (undo_row) { + row_upd_set_vcol_data( + undo_row, field, len, vcol); + } + } + ut_ad(ptr<= end_ptr); + } + } +} + +/***********************************************************//** +Replaces the new column values stored in the update vector. 
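+That is, for each column of the row that has a matching field in the
+update vector, the new value is copied into the row tuple; the positions
+of ordering columns that remain stored externally are collected so that
+an optional prefix cache (row_ext_t) can be built, and indexed virtual
+column values are replaced via row_upd_replace_vcol().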
*/ +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint col_no; + ulint i; + ulint n_cols; + ulint n_ext_cols; + ulint* ext_cols; + const dict_table_t* table; + + ut_ad(row); + ut_ad(ext); + ut_ad(index); + ut_ad(dict_index_is_clust(index)); + ut_ad(update); + ut_ad(heap); + ut_ad(update->validate()); + + n_cols = dtuple_get_n_fields(row); + table = index->table; + ut_ad(n_cols == dict_table_get_n_cols(table)); + + ext_cols = static_cast<ulint*>( + mem_heap_alloc(heap, n_cols * sizeof *ext_cols)); + + n_ext_cols = 0; + + dtuple_set_info_bits(row, update->info_bits); + + for (col_no = 0; col_no < n_cols; col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + const ulint clust_pos + = dict_col_get_clust_pos(col, index); + dfield_t* dfield; + + if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) { + + continue; + } + + dfield = dtuple_get_nth_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + + if (upd_field->field_no != clust_pos + || upd_fld_is_virtual_col(upd_field)) { + + continue; + } + + dfield_copy_data(dfield, &upd_field->new_val); + break; + } + + if (dfield_is_ext(dfield) && col->ord_part) { + ext_cols[n_ext_cols++] = col_no; + } + } + + if (n_ext_cols) { + *ext = row_ext_create(n_ext_cols, ext_cols, *table, row, heap); + } else { + *ext = NULL; + } + + row_upd_replace_vcol(row, table, update, true, nullptr, nullptr); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
*/ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext, /*!< NULL, or prefixes of the externally + stored columns in the old row */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */ +{ + ulint n_unique; + ulint i; + const dict_index_t* clust_index; + + ut_ad(!index->table->skip_alter_undo); + + n_unique = dict_index_get_n_unique(index); + + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n_unique; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_no; + const upd_field_t* upd_field; + const dfield_t* dfield; + dfield_t dfield_ext; + ulint dfield_len= 0; + const byte* buf; + bool is_virtual; + const dict_v_col_t* vcol = NULL; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_no = dict_col_get_no(col); + is_virtual = col->is_virtual(); + + if (is_virtual) { + vcol = reinterpret_cast<const dict_v_col_t*>(col); + + upd_field = upd_get_field_by_field_no( + update, vcol->v_pos, true); + } else { + upd_field = upd_get_field_by_field_no( + update, static_cast<uint16_t>( + dict_col_get_clust_pos( + col, clust_index)), + false); + } + + if (upd_field == NULL) { + continue; + } + + if (row == NULL) { + ut_ad(ext == NULL); + return(TRUE); + } + + if (is_virtual) { + dfield = dtuple_get_nth_v_field( + row, vcol->v_pos); + } else { + dfield = dtuple_get_nth_field(row, col_no); + } + + /* For spatial index update, since the different geometry + data could generate same MBR, so, if the new index entry is + same as old entry, which means the MBR is not changed, we + don't need to do anything. */ + if (dict_index_is_spatial(index) && i == 0) { + double mbr1[SPDIMS * 2]; + double mbr2[SPDIMS * 2]; + rtr_mbr_t* old_mbr; + rtr_mbr_t* new_mbr; + const uchar* dptr = NULL; + ulint flen = 0; + ulint dlen = 0; + mem_heap_t* temp_heap = NULL; + const dfield_t* new_field = &upd_field->new_val; + + const ulint zip_size = ext + ? ext->zip_size + : index->table->space->zip_size(); + + ut_ad(dfield->data != NULL + && dfield->len > GEO_DATA_HEADER_SIZE); + ut_ad(dict_col_get_spatial_status(col) != SPATIAL_NONE); + + /* Get the old mbr. */ + if (dfield_is_ext(dfield)) { + /* For off-page stored data, we + need to read the whole field data. */ + flen = dfield_get_len(dfield); + dptr = static_cast<const byte*>( + dfield_get_data(dfield)); + temp_heap = mem_heap_create(1000); + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, + zip_size, + flen, + temp_heap); + } else { + dptr = static_cast<const uchar*>(dfield->data); + dlen = dfield->len; + } + + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen + - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr1); + old_mbr = reinterpret_cast<rtr_mbr_t*>(mbr1); + + /* Get the new mbr. */ + if (dfield_is_ext(new_field)) { + if (flag == ROW_BUILD_FOR_UNDO + && dict_table_has_atomic_blobs( + index->table)) { + /* For ROW_FORMAT=DYNAMIC + or COMPRESSED, a prefix of + off-page records is stored + in the undo log record + (for any column prefix indexes). + For SPATIAL INDEX, we must + ignore this prefix. The + full column value is stored in + the BLOB. + For non-spatial index, we + would have already fetched a + necessary prefix of the BLOB, + available in the "ext" parameter. 
+ + Here, for SPATIAL INDEX, we are + fetching the full column, which is + potentially wasting a lot of I/O, + memory, and possibly involving a + concurrency problem, similar to ones + that existed before the introduction + of row_ext_t. + + MDEV-11657 FIXME: write the MBR + directly to the undo log record, + and avoid recomputing it here! */ + flen = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(dfield_get_len(new_field) >= + BTR_EXTERN_FIELD_REF_SIZE); + dptr = static_cast<const byte*>( + dfield_get_data(new_field)) + + dfield_get_len(new_field) + - BTR_EXTERN_FIELD_REF_SIZE; + } else { + flen = dfield_get_len(new_field); + dptr = static_cast<const byte*>( + dfield_get_data(new_field)); + } + + if (temp_heap == NULL) { + temp_heap = mem_heap_create(1000); + } + + dptr = btr_copy_externally_stored_field( + &dlen, dptr, + zip_size, + flen, + temp_heap); + } else { + dptr = static_cast<const byte*>( + upd_field->new_val.data); + dlen = upd_field->new_val.len; + } + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen + - GEO_DATA_HEADER_SIZE), + SPDIMS, mbr2); + new_mbr = reinterpret_cast<rtr_mbr_t*>(mbr2); + + if (temp_heap) { + mem_heap_free(temp_heap); + } + + if (!MBR_EQUAL_CMP(old_mbr, new_mbr)) { + return(TRUE); + } else { + continue; + } + } + + /* This treatment of column prefix indexes is loosely + based on row_build_index_entry(). */ + + if (UNIV_LIKELY(ind_field->prefix_len == 0) + || dfield_is_null(dfield)) { + /* do nothing special */ + } else if (ext) { + /* Silence a compiler warning without + silencing a Valgrind error. */ + dfield_len = 0; + MEM_UNDEFINED(&dfield_len, sizeof dfield_len); + /* See if the column is stored externally. */ + buf = row_ext_lookup(ext, col_no, &dfield_len); + + ut_ad(col->ord_part); + + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + /* The externally stored field + was not written yet. This + record should only be seen by + trx_rollback_recovered() + when the server had crashed before + storing the field. */ + ut_ad(!thr + || thr->graph->trx->is_recovered); + ut_ad(!thr + || thr->graph->trx + == trx_roll_crash_recv_trx); + return(TRUE); + } + + goto copy_dfield; + } + } else if (dfield_is_ext(dfield)) { + dfield_len = dfield_get_len(dfield); + ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE); + dfield_len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(dict_index_is_clust(index) + || ind_field->prefix_len <= dfield_len); + + buf= static_cast<const byte*>(dfield_get_data(dfield)); +copy_dfield: + ut_a(dfield_len > 0); + dfield_copy(&dfield_ext, dfield); + dfield_set_data(&dfield_ext, buf, dfield_len); + dfield = &dfield_ext; + } + + if (!dfield_datas_are_binary_equal( + dfield, &upd_field->new_val, + ind_field->prefix_len)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! 
+@return TRUE if update vector may change an ordering field in an index +record */ +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update) /*!< in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (upd_fld_is_virtual_col(upd_field)) { + if (dict_table_get_nth_v_col(index->table, + upd_field->field_no) + ->m_col.ord_part) { + return(TRUE); + } + } else { + if (dict_field_get_col(dict_index_get_nth_field( + index, upd_field->field_no))->ord_part) { + return(TRUE); + } + } + } + + return(FALSE); +} + +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether the Doc ID column is changed */ +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + ut_ad(!table->skip_alter_undo); + + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no); + + return(col_no == fts->doc_col); +} +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ +{ + ulint col_no; + dict_index_t* clust_index; + fts_t* fts = table->fts; + + ut_ad(!table->skip_alter_undo); + + if (upd_fld_is_virtual_col(upd_field)) { + col_no = upd_field->field_no; + return(dict_table_is_fts_column(fts->indexes, col_no, true)); + } else { + clust_index = dict_table_get_first_index(table); + + /* Convert from index-specific column number to table-global + column number. */ + col_no = dict_index_get_nth_col_no(clust_index, + upd_field->field_no); + return(dict_table_is_fts_column(fts->indexes, col_no, false)); + } + +} + +/***********************************************************//** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. 
+@return TRUE if changes */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + dtuple_t* entry, /*!< in: index entry */ + dict_index_t* index, /*!< in: index of entry */ + const upd_t* update, /*!< in: update vector for the row */ + ulint n) /*!< in: how many first fields to check */ +{ + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + ut_ad(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field_t* upd_field + = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &upd_field->new_val, 0)) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/*********************************************************************//** +Copies the column values from a record. */ +UNIV_INLINE +void +row_upd_copy_columns( +/*=================*/ + rec_t* rec, /*!< in: record in a clustered index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const dict_index_t* index, /*!< in: index of rec */ + sym_node_t* column) /*!< in: first column in a column list, or + NULL */ +{ + ut_ad(dict_index_is_clust(index)); + + const byte* data; + ulint len; + + while (column) { + data = rec_get_nth_cfield( + rec, index, offsets, + column->field_nos[SYM_CLUST_FIELD_NO], &len); + eval_node_copy_and_alloc_val(column, data, len); + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/*********************************************************************//** +Calculates the new values for fields to update. Note that row_upd_copy_columns +must have been called first. */ +UNIV_INLINE +void +row_upd_eval_new_vals( +/*==================*/ + upd_t* update) /*!< in/out: update vector */ +{ + que_node_t* exp; + upd_field_t* upd_field; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + exp = upd_field->exp; + + eval_exp(exp); + + dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp)); + } +} + +/** Stores to the heap the virtual columns that need for any indexes +@param[in,out] node row update node +@param[in] update an update vector if it is update +@param[in] thd mysql thread handle +@param[in,out] mysql_table mysql table object +@return true if success + false if virtual column value computation fails. */ +static +bool +row_upd_store_v_row( + upd_node_t* node, + const upd_t* update, + THD* thd, + TABLE* mysql_table) +{ + dict_index_t* index = dict_table_get_first_index(node->table); + ib_vcol_row vc(NULL); + + for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(node->table); + col_no++) { + + const dict_v_col_t* col + = dict_table_get_nth_v_col(node->table, col_no); + + if (col->m_col.ord_part) { + dfield_t* dfield + = dtuple_get_nth_v_field(node->row, col_no); + ulint n_upd + = update ? 
upd_get_n_fields(update) : 0; + ulint i = 0; + + /* Check if the value is already in update vector */ + for (i = 0; i < n_upd; i++) { + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + if (!(upd_field->new_val.type.prtype + & DATA_VIRTUAL) + || upd_field->field_no != col->v_pos) { + continue; + } + + dfield_copy_data(dfield, upd_field->old_v_val); + dfield_dup(dfield, node->heap); + break; + } + + /* Not updated */ + if (i >= n_upd) { + /* If this is an update, then the value + should be in update->old_vrow */ + if (update) { + if (update->old_vrow == NULL) { + /* This only happens in + cascade update. And virtual + column can't be affected, + so it is Ok to set it to NULL */ + dfield_set_null(dfield); + } else { + dfield_t* vfield + = dtuple_get_nth_v_field( + update->old_vrow, + col_no); + dfield_copy_data(dfield, vfield); + dfield_dup(dfield, node->heap); + } + } else { + uchar *record = vc.record(thd, index, + &mysql_table); + /* Need to compute, this happens when + deleting row */ + dfield_t* vfield = + innobase_get_computed_value( + node->row, col, index, + &vc.heap, node->heap, + NULL, thd, mysql_table, + record, NULL, NULL); + if (vfield == NULL) { + return false; + } + } + } + } + } + + return true; +} + +/** Stores to the heap the row on which the node->pcur is positioned. +@param[in] node row update node +@param[in] thd mysql thread handle +@param[in,out] mysql_table NULL, or mysql table object when + user thread invokes dml +@return false if virtual column value computation fails + true otherwise. */ +static +bool +row_upd_store_row( + upd_node_t* node, + THD* thd, + TABLE* mysql_table) +{ + dict_index_t* clust_index; + rec_t* rec; + mem_heap_t* heap = NULL; + row_ext_t** ext; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const rec_offs* offsets; + rec_offs_init(offsets_); + + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); + + if (node->row != NULL) { + mem_heap_empty(node->heap); + } + + clust_index = dict_table_get_first_index(node->table); + + rec = btr_pcur_get_rec(node->pcur); + + offsets = rec_get_offsets(rec, clust_index, offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_table_has_atomic_blobs(node->table)) { + /* There is no prefix of externally stored columns in + the clustered index record. Build a cache of column + prefixes. */ + ext = &node->ext; + } else { + /* REDUNDANT and COMPACT formats store a local + 768-byte prefix of each externally stored column. + No cache is needed. */ + ext = NULL; + node->ext = NULL; + } + + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + NULL, NULL, NULL, ext, node->heap); + + if (node->table->n_v_cols) { + bool ok = row_upd_store_v_row(node, + node->is_delete ? NULL : node->update, + thd, mysql_table); + if (!ok) { + return false; + } + } + + if (node->is_delete == PLAIN_DELETE) { + node->upd_row = NULL; + node->upd_ext = NULL; + } else { + node->upd_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->upd_row, &node->upd_ext, + clust_index, node->update, node->heap); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return true; +} + +/***********************************************************//** +Updates a secondary index entry of a row. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_index_entry( +/*====================*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + mtr_t mtr; + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + dberr_t err = DB_SUCCESS; + trx_t* trx = thr_get_trx(thr); + btr_latch_mode mode; + ulint flags; + enum row_search_result search_result; + + ut_ad(trx->id != 0); + + index = node->index; + ut_ad(index->is_committed()); + + /* For secondary indexes, index->online_status==ONLINE_INDEX_COMPLETE + if index->is_committed(). */ + ut_ad(!dict_index_is_online_ddl(index)); + + const bool referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + const bool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif /* WITH_WSREP */ + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, node->ext, index, heap); + ut_a(entry); + + log_free_check(); + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_index_entry"); + + mtr.start(); + mode = BTR_MODIFY_LEAF; + + switch (index->table->space_id) { + case SRV_TMP_SPACE_ID: + mtr.set_log_mode(MTR_LOG_NO_REDO); + flags = BTR_NO_LOCKING_FLAG; + break; + default: + index->set_modified(mtr); + /* fall through */ + case IBUF_SPACE_ID: + flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0; + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + if (!referenced) { + mode = BTR_DELETE_MARK_LEAF; + } + break; + } + + /* Set the query thread, so that ibuf_insert_low() will be + able to invoke thd_get_trx(). */ + pcur.btr_cur.thr = thr; + pcur.btr_cur.page_cur.index = index; + + if (index->is_spatial()) { + mode = btr_latch_mode(BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK); + if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) { + goto found; + } + + if (pcur.btr_cur.rtr_info->fd_del) { + /* We found the record, but a delete marked */ + goto close; + } + + goto not_found; + } + + search_result = row_search_index_entry(entry, mode, &pcur, &mtr); + + switch (search_result) { + const rec_t* rec; + case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */ + ut_error; + break; + case ROW_BUFFERED: + /* Entry was delete marked already. 
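+		   (The delete-mark request was buffered in the change
+		   buffer because the secondary index leaf page was not
+		   in the buffer pool; the mark will be applied when the
+		   buffered change is merged, so nothing more needs to
+		   be done for the old entry here.)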
*/ + break; + + case ROW_NOT_FOUND: +not_found: + rec = btr_pcur_get_rec(&pcur); + ib::error() + << "Record in index " << index->name + << " of table " << index->table->name + << " was not found on update: " << *entry + << " at: " << rec_index_print(rec, index); +#ifdef UNIV_DEBUG + mtr_commit(&mtr); + mtr_start(&mtr); + ut_ad(btr_validate_index(index, 0) == DB_SUCCESS); + ut_ad(0); +#endif /* UNIV_DEBUG */ + break; + case ROW_FOUND: +found: + ut_ad(err == DB_SUCCESS); + rec = btr_pcur_get_rec(&pcur); + + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_sec_index_entry() below */ + if (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table))) { + err = lock_sec_rec_modify_check_and_lock( + flags, + btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), index, thr, &mtr); + if (err != DB_SUCCESS) { + break; + } + + btr_rec_set_deleted<true>(btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), + &mtr); +#ifdef WITH_WSREP + if (!referenced && foreign + && wsrep_must_process_fk(node, trx) + && !wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + + rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + err = wsrep_row_upd_check_foreign_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + default: + WSREP_ERROR("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + } + } +#endif /* WITH_WSREP */ + } + +#ifdef WITH_WSREP + ut_ad(err == DB_SUCCESS || err == DB_LOCK_WAIT + || err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT); +#else + ut_ad(err == DB_SUCCESS); +#endif + + if (referenced) { + rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + } + } + +close: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete == PLAIN_DELETE || err != DB_SUCCESS) { + + goto func_exit; + } + + mem_heap_empty(heap); + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_new_index_entry"); + + /* Build a new index entry */ + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + /* Insert new index entry */ + err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete); + +func_exit: + mem_heap_free(heap); + + return(err); +} + +/***********************************************************//** +Updates the secondary index record if it is changed in the row update or +deletes it if this is a delete. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_sec_step( +/*=============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!dict_index_is_clust(node->index)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->index, node->update, + thr, node->row, node->ext)) { + return(row_upd_sec_index_entry(node, thr)); + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,index,offsets,entry,update) +#else /* UNIV_DEBUG */ +# define row_upd_clust_rec_by_insert_inherit(rec,index,offsets,entry,update) \ + row_upd_clust_rec_by_insert_inherit_func(rec,entry,update) +#endif /* UNIV_DEBUG */ +/*******************************************************************//** +Mark non-updated off-page columns inherited when the primary key is +updated. We must mark them as inherited in entry, so that they are not +freed in a rollback. A limited version of this function used to be +called btr_cur_mark_dtuple_inherited_extern(). +@return whether any columns were inherited */ +static +bool +row_upd_clust_rec_by_insert_inherit_func( +/*=====================================*/ + const rec_t* rec, /*!< in: old record, or NULL */ +#ifdef UNIV_DEBUG + dict_index_t* index, /*!< in: index, or NULL */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec), or NULL */ +#endif /* UNIV_DEBUG */ + dtuple_t* entry, /*!< in/out: updated entry to be + inserted into the clustered index */ + const upd_t* update) /*!< in: update vector */ +{ + bool inherit = false; + + ut_ad(!rec == !offsets); + ut_ad(!rec == !index); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < dtuple_get_n_fields(entry); i++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + byte* data; + ulint len; + + ut_ad(!offsets + || !rec_offs_nth_extern(offsets, i) + == !dfield_is_ext(dfield) + || (!dict_index_get_nth_field(index, i)->name + && !dfield_is_ext(dfield) + && (dfield_is_null(dfield) || dfield->len == 0)) + || upd_get_field_by_field_no(update, i, false)); + if (!dfield_is_ext(dfield) + || upd_get_field_by_field_no(update, i, false)) { + continue; + } + +#ifdef UNIV_DEBUG + if (UNIV_LIKELY(rec != NULL)) { + ut_ad(!rec_offs_nth_default(offsets, i)); + const byte* rec_data + = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len == dfield_get_len(dfield)); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + rec_data += len - BTR_EXTERN_FIELD_REF_SIZE; + + /* The pointer must not be zero. */ + ut_ad(memcmp(rec_data, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + /* The BLOB must be owned. */ + ut_ad(!(rec_data[BTR_EXTERN_LEN] + & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG */ + + len = dfield_get_len(dfield); + ut_a(len != UNIV_SQL_NULL); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + data += len - BTR_EXTERN_FIELD_REF_SIZE; + /* The pointer must not be zero. */ + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + /* The BLOB must be owned, unless we are resuming from + a lock wait and we already had disowned the BLOB. 
*/ + ut_a(rec == NULL + || !(data[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + data[BTR_EXTERN_LEN] &= byte(~BTR_EXTERN_OWNER_FLAG); + data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG; + /* The BTR_EXTERN_INHERITED_FLAG only matters in + rollback of a fresh insert. Purge will always free + the extern fields of a delete-marked row. */ + + inherit = true; + } + + return(inherit); +} + +/***********************************************************//** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec_by_insert( +/*========================*/ + upd_node_t* node, /*!< in/out: row update node */ + dict_index_t* index, /*!< in: clustered index of the record */ + que_thr_t* thr, /*!< in: query thread */ + bool referenced,/*!< in: whether index may be referenced in + a foreign key constraint */ +#ifdef WITH_WSREP + bool foreign,/*!< in: whether this is a foreign key */ +#endif + mtr_t* mtr) /*!< in/out: mini-transaction, + may be committed and restarted */ +{ + mem_heap_t* heap; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + dberr_t err; + rec_t* rec; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + ut_ad(dict_index_is_clust(index)); + + rec_offs_init(offsets_); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + heap = mem_heap_create(1000); + + entry = row_build_index_entry_low(node->upd_row, node->upd_ext, + index, heap, ROW_BUILD_FOR_INSERT); + if (index->is_instant()) entry->trim(*index); + ut_ad(dtuple_get_info_bits(entry) == 0); + + { + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx->id); + } + + switch (node->state) { + default: + ut_error; + case UPD_NODE_INSERT_CLUSTERED: + /* A lock wait occurred in row_ins_clust_index_entry() in + the previous invocation of this function. */ + row_upd_clust_rec_by_insert_inherit( + NULL, NULL, NULL, entry, node->update); + break; + case UPD_NODE_UPDATE_CLUSTERED: + /* This is the first invocation of the function where + we update the primary key. Delete-mark the old record + in the clustered index and prepare to insert a new entry. */ + rec = btr_cur_get_rec(btr_cur); + offsets = rec_get_offsets(rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ut_ad(page_rec_is_user_rec(rec)); + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* If the clustered index record is already delete + marked, then we are here after a DB_LOCK_WAIT. + Skip delete marking clustered index and disowning + its blobs. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets) + == trx->id); + ut_ad(!trx_undo_roll_ptr_is_insert( + row_get_rec_roll_ptr(rec, index, + offsets))); + goto check_fk; + } + + err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), rec, index, offsets, + thr, node->row, mtr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + /* If the the new row inherits externally stored + fields (off-page columns a.k.a. 
BLOBs) from the + delete-marked old record, mark them disowned by the + old record and owned by the new entry. */ + + if (rec_offs_any_extern(offsets)) { + if (row_upd_clust_rec_by_insert_inherit( + rec, index, offsets, + entry, node->update)) { + /* The blobs are disowned here, expecting the + insert down below to inherit them. But if the + insert fails, then this disown will be undone + when the operation is rolled back. */ + btr_cur_disown_inherited_fields( + btr_cur_get_block(btr_cur), + rec, index, offsets, node->update, + mtr); + } + } +check_fk: + if (referenced) { + /* NOTE that the following call loses + the position of pcur ! */ + + err = row_upd_check_references_constraints( + node, pcur, table, index, offsets, thr, mtr); + + if (err != DB_SUCCESS) { + goto err_exit; + } +#ifdef WITH_WSREP + } else if (foreign && wsrep_must_process_fk(node, trx)) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, table, index, offsets, thr, mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + + goto err_exit; + default: + WSREP_ERROR("Foreign key check fail: " + "%s on table %s index %s query %s", + ut_strerr(err), index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + + goto err_exit; + } +#endif /* WITH_WSREP */ + } + } + + mtr->commit(); + mtr->start(); + + node->state = UPD_NODE_INSERT_CLUSTERED; + err = row_ins_clust_index_entry(index, entry, thr, + dtuple_get_n_ext(entry)); +err_exit: + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Updates a clustered index record of a row when the ordering fields do +not change. 
+@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_rec( +/*==============*/ + ulint flags, /*!< in: undo logging and locking flags */ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + rec_offs* offsets,/*!< in: rec_get_offsets() on node->pcur */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap, can be emptied */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in,out: mini-transaction; may be + committed and restarted here */ +{ + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + dberr_t err; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!thr_get_trx(thr)->in_rollback); + ut_ad(!node->table->skip_alter_undo); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(btr_cur_get_index(btr_cur) == index); + ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(index->table))); + ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets)); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place( + flags | BTR_NO_LOCKING_FLAG, btr_cur, + offsets, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } else { + err = btr_cur_optimistic_update( + flags | BTR_NO_LOCKING_FLAG, btr_cur, + &offsets, offsets_heap, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } + + if (err == DB_SUCCESS) { + goto func_exit; + } + + if (buf_pool.running_out()) { + err = DB_LOCK_TABLE_FULL; + goto func_exit; + } + + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr->commit(); + mtr->start(); + + if (index->table->is_temporary()) { + /* Disable locking, because temporary tables are never + shared between transactions or connections. */ + flags |= BTR_NO_LOCKING_FLAG; + mtr->set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(*mtr); + } + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(pcur->restore_position(BTR_MODIFY_TREE, mtr) == + btr_pcur_t::SAME_ALL); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + if (!heap) { + heap = mem_heap_create(1024); + } + + err = btr_cur_pessimistic_update( + flags | BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &offsets, offsets_heap, heap, &big_rec, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); + if (big_rec) { + ut_a(err == DB_SUCCESS); + + DEBUG_SYNC_C("before_row_upd_extern"); + err = btr_store_big_rec_extern_fields( + pcur, offsets, big_rec, mtr, BTR_STORE_UPDATE); + DEBUG_SYNC_C("after_row_upd_extern"); + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/***********************************************************//** +Delete marks a clustered index record. 
+@return DB_SUCCESS if operation successfully completed, else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_del_mark_clust_rec( +/*=======================*/ + upd_node_t* node, /*!< in: row update node */ + dict_index_t* index, /*!< in: clustered index */ + rec_offs* offsets,/*!< in/out: rec_get_offsets() for the + record under the cursor */ + que_thr_t* thr, /*!< in: query thread */ + bool referenced, + /*!< in: whether index may be referenced in + a foreign key constraint */ +#ifdef WITH_WSREP + bool foreign,/*!< in: whether this is a foreign key */ +#endif + mtr_t* mtr) /*!< in,out: mini-transaction; + will be committed and restarted */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + rec_t* rec; + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(node->is_delete == PLAIN_DELETE); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + if (!row_upd_store_row(node, trx->mysql_thd, + thr->prebuilt && thr->prebuilt->table == node->table + ? thr->prebuilt->m_mysql_table : NULL)) { + return DB_COMPUTE_VALUE_FAILED; + } + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + rec = btr_cur_get_rec(btr_cur); + + dberr_t err = btr_cur_del_mark_set_clust_rec( + btr_cur_get_block(btr_cur), rec, + index, offsets, thr, node->row, mtr); + + if (err != DB_SUCCESS) { + } else if (referenced) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints( + node, pcur, index->table, index, offsets, thr, mtr); +#ifdef WITH_WSREP + } else if (foreign && wsrep_must_process_fk(node, trx)) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, index->table, index, offsets, thr, mtr); + + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_LOCK_WAIT: + case DB_DEADLOCK: + case DB_LOCK_WAIT_TIMEOUT: + WSREP_DEBUG("Foreign key check fail: " + "%d on table %s index %s query %s", + err, index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + default: + WSREP_ERROR("Foreign key check fail: " + "%d on table %s index %s query %s", + err, index->name(), index->table->name.m_name, + wsrep_thd_query(trx->mysql_thd)); + break; + } +#endif /* WITH_WSREP */ + } + + return(err); +} + +/***********************************************************//** +Updates the clustered index record. 
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT +in case of a lock wait, else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_upd_clust_step( +/*===============*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + dberr_t err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets; + ulint flags; + trx_t* trx = thr_get_trx(thr); + + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + if (index->is_corrupted()) { + return DB_TABLE_CORRUPT; + } + + const bool referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + const bool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + + mtr.start(); + + if (node->table->is_temporary()) { + /* Disable locking, because temporary tables are + private to the connection (no concurrent access). */ + flags = node->table->no_rollback() + ? BTR_NO_ROLLBACK + : BTR_NO_LOCKING_FLAG; + /* Redo logging only matters for persistent tables. */ + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + flags = node->table->no_rollback() ? BTR_NO_ROLLBACK : 0; + index->set_modified(mtr); + } + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. */ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + btr_latch_mode mode; + + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, "innodb_row_upd_clust_step_enter"); + + if (dict_index_is_online_ddl(index)) { + ut_ad(node->table->id != DICT_INDEXES_ID); + mode = BTR_MODIFY_LEAF_ALREADY_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + mode = BTR_MODIFY_LEAF; + } + + if (pcur->restore_position(mode, &mtr) != btr_pcur_t::SAME_ALL) { + err = DB_RECORD_NOT_FOUND; + goto exit_func; + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (!flags && !node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + goto exit_func; + } + } + + ut_ad(index->table->no_rollback() || index->table->is_temporary() + || row_get_rec_trx_id(rec, index, offsets) == trx->id + || lock_trx_has_expl_x_lock(*trx, *index->table, + btr_pcur_get_block(pcur)->page.id(), + page_rec_get_heap_no(rec))); + + if (node->is_delete == PLAIN_DELETE) { + err = row_upd_del_mark_clust_rec( + node, index, offsets, thr, referenced, +#ifdef WITH_WSREP + foreign, +#endif + &mtr); + goto all_done; + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (UNIV_UNLIKELY(!node->in_mysql_interface)) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, index, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (!node->is_delete && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + err = row_upd_clust_rec( + flags, node, index, offsets, &heap, thr, &mtr); + goto 
exit_func; + } + + if (!row_upd_store_row(node, trx->mysql_thd, thr->prebuilt + ? thr->prebuilt->m_mysql_table : NULL)) { + err = DB_COMPUTE_VALUE_FAILED; + goto exit_func; + } + + if (row_upd_changes_ord_field_binary(index, node->update, thr, + node->row, node->ext)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! */ + + err = row_upd_clust_rec_by_insert( + node, index, thr, referenced, +#ifdef WITH_WSREP + foreign, +#endif + &mtr); +all_done: + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; +success: + node->index = dict_table_get_next_index(index); + } + } else { + err = row_upd_clust_rec( + flags, node, index, offsets, &heap, thr, &mtr); + + if (err == DB_SUCCESS) { + ut_ad(node->is_delete != PLAIN_DELETE); + node->state = node->is_delete + ? UPD_NODE_UPDATE_ALL_SEC + : UPD_NODE_UPDATE_SOME_SEC; + goto success; + } + } + +exit_func: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return err; +} + +/***********************************************************//** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. +@return DB_SUCCESS if operation successfully completed, else error +code or DB_LOCK_WAIT */ +static +dberr_t +row_upd( +/*====*/ + upd_node_t* node, /*!< in: row update node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + DBUG_ENTER("row_upd"); + + ut_ad(!thr_get_trx(thr)->in_rollback); + + DBUG_PRINT("row_upd", ("table: %s", node->table->name.m_name)); + DBUG_PRINT("row_upd", ("info bits in update vector: 0x%x", + node->update ? node->update->info_bits: 0)); + DBUG_PRINT("row_upd", ("foreign_id: %s", + node->foreign ? 
node->foreign->id: "NULL")); + + if (UNIV_LIKELY(node->in_mysql_interface)) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete == PLAIN_DELETE + || row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + switch (node->state) { + case UPD_NODE_UPDATE_CLUSTERED: + case UPD_NODE_INSERT_CLUSTERED: + log_free_check(); + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_upd_clust"); + + if (node->index == NULL + || (!node->is_delete + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { + + DBUG_RETURN(DB_SUCCESS); + } + + DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;); + + do { + if (!node->index) { + break; + } + + if (!(node->index->type & (DICT_FTS | DICT_CORRUPT)) + && node->index->is_committed()) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + DBUG_RETURN(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } while (node->index != NULL); + + ut_ad(err == DB_SUCCESS); + + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + DBUG_RETURN(err); +} + +/***********************************************************//** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + dberr_t err = DB_SUCCESS; + trx_t* trx; + DBUG_ENTER("row_upd_step"); + + ut_ad(thr); + + trx = thr_get_trx(thr); + + node = static_cast<upd_node_t*>(thr->run_node); + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(node->table, nullptr, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + DBUG_RETURN(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + DBUG_RETURN(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + DBUG_RETURN(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = 
sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + DBUG_RETURN(thr); +} + +/** Write query start time as SQL field data to a buffer. Needed by InnoDB. +@param thd Thread object +@param buf Buffer to hold start time data */ +void thd_get_query_start_data(THD *thd, char *buf); + +/** Appends row_start or row_end field to update vector and sets a +CURRENT_TIMESTAMP/trx->id value to it. Called by vers_make_update() and +vers_make_delete(). +@param[in] trx transaction +@param[in] vers_sys_idx table->row_start or table->row_end */ +void upd_node_t::vers_update_fields(const trx_t *trx, ulint idx) +{ + ut_ad(in_mysql_interface); // otherwise needs to recalculate node->cmpl_info + ut_ad(idx == table->vers_start || idx == table->vers_end); + + dict_index_t *clust_index= dict_table_get_first_index(table); + const dict_col_t *col= dict_table_get_nth_col(table, idx); + ulint field_no= dict_col_get_clust_pos(col, clust_index); + upd_field_t *ufield; + + for (ulint i= 0; i < update->n_fields; ++i) + { + if (update->fields[i].field_no == field_no) + { + ufield= &update->fields[i]; + goto skip_append; + } + } + + /* row_create_update_node_for_mysql() pre-allocated this much. + At least one PK column always remains unchanged. */ + ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols)); + + update->n_fields++; + ufield= upd_get_nth_field(update, update->n_fields - 1); + upd_field_set_field_no(ufield, static_cast<uint16_t>(field_no), clust_index); + +skip_append: + char *where= reinterpret_cast<char *>(update->vers_sys_value); + if (col->vers_native()) + mach_write_to_8(where, trx->id); + else + thd_get_query_start_data(trx->mysql_thd, where); + + dfield_set_data(&ufield->new_val, update->vers_sys_value, col->len); + + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + if (!v_col->m_col.ord_part) + continue; + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + { + dict_col_t *base_col= v_col->base_col[i]; + if (base_col->ind == col->ind) + { + /* Virtual column depends on system field value + which we updated above. Remove it from update + vector, so it is recalculated in + row_upd_store_v_row() (see !update branch). */ + update->remove(v_col->v_pos); + break; + } + } + } +} + + +/** Prepare update vector for versioned delete. +Set row_end to CURRENT_TIMESTAMP or trx->id. +Initialize fts_next_doc_id for versioned delete. +@param[in] trx transaction */ +void upd_node_t::vers_make_delete(trx_t* trx) +{ + update->n_fields= 0; + is_delete= VERSIONED_DELETE; + vers_update_fields(trx, table->vers_end); + trx->fts_next_doc_id= table->fts ? UINT64_UNDEFINED : 0; +} diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc new file mode 100644 index 00000000..c3acf325 --- /dev/null +++ b/storage/innobase/row/row0vers.cc @@ -0,0 +1,1419 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0vers.cc +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "row0mysql.h" + +/** Check whether all non-virtual index fields are equal. +@param[in] index the secondary index +@param[in] a first index entry to compare +@param[in] b second index entry to compare +@return whether all non-virtual fields are equal */ +static +bool +row_vers_non_virtual_fields_equal( + const dict_index_t* index, + const dfield_t* a, + const dfield_t* b) +{ + const dict_field_t* end = &index->fields[index->n_fields]; + + for (const dict_field_t* ifield = index->fields; ifield != end; + ifield++) { + if (!ifield->col->is_virtual() + && cmp_dfield_dfield(a++, b++)) { + return false; + } + } + + return true; +} + +/** Determine if an active transaction has inserted or modified a secondary +index record. +@param[in,out] caller_trx trx of current thread +@param[in] clust_rec clustered index record +@param[in] clust_index clustered index +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@param[in,out] mtr mini-transaction +@return the active transaction; state must be rechecked after +acquiring trx->mutex, and trx->release_reference() must be invoked +@retval NULL if the record was committed */ +UNIV_INLINE +trx_t* +row_vers_impl_x_locked_low( + trx_t* caller_trx, + const rec_t* clust_rec, + dict_index_t* clust_index, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets, + mtr_t* mtr) +{ + trx_id_t trx_id; + rec_t* prev_version = NULL; + rec_offs clust_offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* clust_offsets; + mem_heap_t* heap; + dtuple_t* ientry = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + rec_offs_init(clust_offsets_); + + DBUG_ENTER("row_vers_impl_x_locked_low"); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(clust_rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + if (ulint trx_id_offset = clust_index->trx_id_offset) { + trx_id = mach_read_from_6(clust_rec + trx_id_offset); + if (trx_id == 0) { + /* The transaction history was already purged. */ + DBUG_RETURN(0); + } + } + + heap = mem_heap_create(1024); + + clust_offsets = rec_get_offsets(clust_rec, clust_index, clust_offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + if (trx_id == 0) { + /* The transaction history was already purged. 
*/ + mem_heap_free(heap); + DBUG_RETURN(0); + } + + ut_ad(!clust_index->table->is_temporary()); + + trx_t* trx; + + if (trx_id == caller_trx->id) { + trx = caller_trx; + trx->reference(); + } else { + trx = trx_sys.find(caller_trx, trx_id); + if (trx == 0) { + /* The transaction that modified or inserted + clust_rec is no longer active, or it is + corrupt: no implicit lock on rec */ + lock_check_trx_id_sanity(trx_id, clust_rec, + clust_index, clust_offsets); + mem_heap_free(heap); + DBUG_RETURN(0); + } + } + + const ulint comp = page_rec_is_comp(rec); + ut_ad(index->table == clust_index->table); + ut_ad(!!comp == dict_table_is_comp(index->table)); + ut_ad(!comp == !page_rec_is_comp(clust_rec)); + + const ulint rec_del = rec_get_deleted_flag(rec, comp); + + if (dict_index_has_virtual(index)) { + ulint est_size = DTUPLE_EST_ALLOC(index->n_fields); + + /* Allocate the dtuple for virtual columns extracted from undo + log with its own heap, so to avoid it being freed as we + iterating in the version loop below. */ + v_heap = mem_heap_create(est_size); + ientry = row_rec_to_index_entry(rec, index, offsets, v_heap); + } + + /* We look up if some earlier version, which was modified by + the trx_id transaction, of the clustered index record would + require rec to be in a different state (delete marked or + unmarked, or have different field values, or not existing). If + there is such a version, then rec was modified by the trx_id + transaction, and it has an implicit x-lock on rec. Note that + if clust_rec itself would require rec to be in a different + state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock + on rec. */ + + for (const rec_t* version = clust_rec;; version = prev_version) { + row_ext_t* ext; + dtuple_t* row; + dtuple_t* entry; + ulint vers_del; + trx_id_t prev_trx_id; + mem_heap_t* old_heap = heap; + dtuple_t* vrow = NULL; + + /* We keep the semaphore in mtr on the clust_rec page, so + that no other transaction can update it and get an + implicit x-lock on rec until mtr_commit(mtr). */ + + heap = mem_heap_create(1024); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, + dict_index_has_virtual(index) ? &vrow : NULL, 0); + + ut_d(trx->mutex_lock()); + const bool committed = trx_state_eq( + trx, TRX_STATE_COMMITTED_IN_MEMORY); + ut_d(trx->mutex_unlock()); + + /* The oldest visible clustered index version must not be + delete-marked, because we never start a transaction by + inserting a delete-marked record. */ + ut_ad(committed || prev_version + || !rec_get_deleted_flag(version, comp)); + + /* Free version and clust_offsets. */ + mem_heap_free(old_heap); + + if (committed) { + goto not_locked; + } + + if (prev_version == NULL) { + + /* We reached the oldest visible version without + finding an older version of clust_rec that would + match the secondary index record. If the secondary + index record is not delete marked, then clust_rec + is considered the correct match of the secondary + index record and hence holds the implicit lock. */ + + if (rec_del) { + /* The secondary index record is del marked. + So, the implicit lock holder of clust_rec + did not modify the secondary index record yet, + and is not holding an implicit lock on it. + + This assumes that whenever a row is inserted + or updated, the leaf page record always is + created with a clear delete-mark flag. + (We never insert a delete-marked record.) 
*/ +not_locked: + trx->release_reference(); + trx = 0; + } + + break; + } + + clust_offsets = rec_get_offsets( + prev_version, clust_index, clust_offsets_, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + vers_del = rec_get_deleted_flag(prev_version, comp); + + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); + + /* The stack of versions is locked by mtr. Thus, it + is safe to fetch the prefixes for externally stored + columns. */ + + row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, + clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + /* Keep the virtual row info for the next + version */ + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, + heap); + + /* entry could only be NULL (the + clustered index record could contain + BLOB pointers that are NULL) if we + were accessing a freshly inserted + record before it was fully inserted. + prev_version cannot possibly be such + an incomplete record, because its + transaction would have to be committed + in order for later versions of the + record to be able to exist. */ + ut_ad(entry); + + /* If the indexed virtual columns has changed, + there must be log record to generate vrow. + Otherwise, it is not changed, so no need + to compare */ + if (!row_vers_non_virtual_fields_equal( + index, + ientry->fields, entry->fields)) { + if (rec_del != vers_del) { + break; + } + } else if (!rec_del) { + break; + } + + goto result_check; + } else { + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* entry could only be NULL (the clustered index + record could contain BLOB pointers that are NULL) if + we were accessing a freshly inserted record before it + was fully inserted. prev_version cannot possibly be + such an incomplete record, because its transaction + would have to be committed in order for later versions + of the record to be able to exist. */ + ut_ad(entry); + + /* If we get here, we know that the trx_id transaction + modified prev_version. Let us check if prev_version + would require rec to be in a different state. */ + + /* The previous version of clust_rec must be + accessible, because clust_rec was not a fresh insert. + There is no guarantee that the transaction is still + active. */ + + /* We check if entry and rec are identified in the alphabetical + ordering */ + if (0 == cmp_dtuple_rec(entry, rec, index, offsets)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + + break; + } + + /* It is possible that the row was updated so that the + secondary index record remained the same in + alphabetical ordering, but the field values changed + still. For example, 'abc' -> 'ABC'. Check also that. 
*/ + + dtuple_set_types_binary( + entry, dtuple_get_n_fields(entry)); + + if (cmp_dtuple_rec(entry, rec, index, offsets)) { + + break; + } + + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + break; + } + +result_check: + if (trx->id != prev_trx_id) { + /* prev_version was the first version modified by + the trx_id transaction: no implicit x-lock */ + goto not_locked; + } + } + + if (trx) { + DBUG_PRINT("info", ("Implicit lock is held by trx:" TRX_ID_FMT, + trx_id)); + } + + if (v_heap != NULL) { + mem_heap_free(v_heap); + } + + mem_heap_free(heap); + DBUG_RETURN(trx); +} + +/** Determine if an active transaction has inserted or modified a secondary +index record. +@param[in,out] caller_trx trx of current thread +@param[in] rec secondary index record +@param[in] index secondary index +@param[in] offsets rec_get_offsets(rec, index) +@return the active transaction; state must be rechecked after +acquiring trx->mutex, and trx->release_reference() must be invoked +@retval NULL if the record was committed */ +trx_t* +row_vers_impl_x_locked( + trx_t* caller_trx, + const rec_t* rec, + dict_index_t* index, + const rec_offs* offsets) +{ + mtr_t mtr; + trx_t* trx; + const rec_t* clust_rec; + dict_index_t* clust_index; + + lock_sys.assert_unlocked(); + + mtr_start(&mtr); + + /* Search for the clustered index record. The latch on the + page of clust_rec locks the top of the stack of versions. The + bottom of the version stack is not locked; oldest versions may + disappear by the fact that transactions may be committed and + collected by the purge. This is not a problem, because we are + only interested in active transactions. */ + + clust_rec = row_get_clust_rec( + BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr); + + if (!clust_rec) { + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.cc + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + trx = 0; + } else { + trx = row_vers_impl_x_locked_low( + caller_trx, clust_rec, clust_index, rec, index, + offsets, &mtr); + + ut_ad(trx == 0 || trx->is_referenced()); + } + + mtr_commit(&mtr); + + return(trx); +} + +/** build virtual column value from current cluster index record data +@param[in,out] row the cluster index row in dtuple form +@param[in] clust_index clustered index +@param[in] index the secondary index +@param[in] heap heap used to build virtual dtuple. 
*/ +static +bool +row_vers_build_clust_v_col( + dtuple_t* row, + dict_index_t* clust_index, + dict_index_t* index, + mem_heap_t* heap) +{ + THD* thd= current_thd; + TABLE* maria_table= 0; + + ut_ad(dict_index_has_virtual(index)); + ut_ad(index->table == clust_index->table); + + DEBUG_SYNC(current_thd, "ib_clust_v_col_before_row_allocated"); + + ib_vcol_row vc(nullptr); + byte *record = vc.record(thd, index, &maria_table); + + ut_ad(maria_table); + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* c = dict_index_get_nth_col(index, i); + + if (c->is_virtual()) { + const dict_v_col_t* col + = reinterpret_cast<const dict_v_col_t*>(c); + + dfield_t *vfield = innobase_get_computed_value( + row, col, clust_index, &vc.heap, + heap, NULL, thd, maria_table, record, NULL, + NULL); + if (!vfield) { + innobase_report_computed_value_failed(row); + ut_ad(0); + return false; + } + } + } + + return true; +} + +/** Build latest virtual column data from undo log +@param[in] in_purge whether this is the purge thread +@param[in] rec clustered index record +@param[in] clust_index clustered index +@param[in,out] clust_offsets offsets on the clustered index record +@param[in] index the secondary index +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] v_heap heap used to build vrow +@param[out] v_row dtuple holding the virtual rows +@param[in,out] mtr mtr holding the latch on rec */ +static +void +row_vers_build_cur_vrow_low( + bool in_purge, + const rec_t* rec, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* v_heap, + dtuple_t** vrow, + mtr_t* mtr) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap = NULL; + ulint num_v = dict_table_get_n_v_cols(index->table); + const dfield_t* field; + ulint i; + bool all_filled = false; + + *vrow = dtuple_create_with_vcol(v_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + } + + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + version = rec; + + /* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE + bit to search the undo log until we hit the current undo log with + roll_ptr */ + const ulint status = in_purge + ? 
TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE + : TRX_UNDO_GET_OLD_V_VALUE; + + while (!all_filled) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, vrow, status); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + break; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + all_filled = true; + + for (i = 0; i < entry_len; i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + field = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if (dfield_get_type(field)->mtype == DATA_MISSING) { + all_filled = false; + break; + } + + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + + mem_heap_free(heap); +} + +/** Check a virtual column value index secondary virtual index matches +that of current cluster index record, which is recreated from information +stored in undo log +@param[in] rec record in the clustered index +@param[in] icentry the index entry built from a cluster row +@param[in] clust_index cluster index +@param[in] clust_offsets offsets on the cluster record +@param[in] index the secondary index +@param[in] ientry the secondary index entry +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] v_heap heap used to build virtual dtuple +@param[in,out] v_row dtuple holding the virtual rows (if needed) +@param[in] mtr mtr holding the latch on rec +@return true if matches, false otherwise */ +static +bool +row_vers_vc_matches_cluster( + const rec_t* rec, + const dtuple_t* icentry, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* v_heap, + dtuple_t** vrow, + mtr_t* mtr) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap2; + mem_heap_t* heap = NULL; + mem_heap_t* tuple_heap; + ulint num_v = dict_table_get_n_v_cols(index->table); + bool compare[REC_MAX_N_FIELDS]; + ulint n_fields = dtuple_get_n_fields(ientry); + ulint n_non_v_col = 0; + ulint n_cmp_v_col = 0; + const dfield_t* field1; + dfield_t* field2; + ulint i; + + /* First compare non-virtual columns (primary keys) */ + ut_ad(index->n_fields == n_fields); + ut_ad(n_fields == dtuple_get_n_fields(icentry)); + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + { + const dfield_t* a = ientry->fields; + const dfield_t* b = icentry->fields; + + for (const dict_field_t *ifield = index->fields, + *const end = &index->fields[index->n_fields]; + ifield != end; ifield++, a++, b++) { + if (!ifield->col->is_virtual()) { + if (cmp_dfield_dfield(a, b)) { + return false; + } + n_non_v_col++; + } + } + } + + tuple_heap = mem_heap_create(1024); + + ut_ad(n_fields > n_non_v_col); + + *vrow = dtuple_create_with_vcol(v_heap ? 
v_heap : tuple_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + compare[i] = false; + } + + version = rec; + + while (n_cmp_v_col < n_fields - n_non_v_col) { + heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + ut_ad(cur_roll_ptr != 0); + ut_ad(roll_ptr != 0); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, NULL, vrow, + TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + goto func_exit; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col = ind_field->col; + field1 = dtuple_get_nth_field(ientry, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast<const dict_v_col_t*>(col); + field2 + = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if ((dfield_get_type(field2)->mtype != DATA_MISSING) + && (!compare[v_col->v_pos])) { + + if (ind_field->prefix_len != 0 + && !dfield_is_null(field2)) { + field2->len = unsigned( + dtype_get_at_most_n_mbchars( + field2->type.prtype, + field2->type.mbminlen, + field2->type.mbmaxlen, + ind_field->prefix_len, + field2->len, + static_cast<char*> + (field2->data))); + } + + /* The index field mismatch */ + if (v_heap + || cmp_dfield_dfield(field2, field1)) { + if (v_heap) { + dtuple_dup_v_fld(*vrow, v_heap); + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + return(false); + } + + compare[v_col->v_pos] = true; + n_cmp_v_col++; + } + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + +func_exit: + if (n_cmp_v_col == 0) { + *vrow = NULL; + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + + /* FIXME: In the case of n_cmp_v_col is not the same as + n_fields - n_non_v_col, callback is needed to compare the rest + columns. 
At the timebeing, we will need to return true */ + return (true); +} + +/** Build a dtuple contains virtual column data for current cluster index +@param[in] in_purge called by purge thread +@param[in] rec cluster index rec +@param[in] clust_index cluster index +@param[in] clust_offsets cluster rec offset +@param[in] index secondary index +@param[in] roll_ptr roll_ptr for the purge record +@param[in] trx_id transaction ID on the purging record +@param[in,out] heap heap memory +@param[in,out] v_heap heap memory to keep virtual colum dtuple +@param[in] mtr mtr holding the latch on rec +@return dtuple contains virtual column data */ +static +dtuple_t* +row_vers_build_cur_vrow( + bool in_purge, + const rec_t* rec, + dict_index_t* clust_index, + rec_offs** clust_offsets, + dict_index_t* index, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mem_heap_t* heap, + mem_heap_t* v_heap, + mtr_t* mtr) +{ + dtuple_t* cur_vrow = NULL; + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, *clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr)) { + + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + + /* This is a newly inserted record and cannot + be deleted, So the externally stored field + cannot be freed yet. */ + dtuple_t* row = row_build(ROW_COPY_POINTERS, clust_index, + rec, *clust_offsets, + NULL, NULL, NULL, NULL, heap); + + if (!row_vers_build_clust_v_col(row, clust_index, index, + heap)) { + return nullptr; + } + + cur_vrow = dtuple_copy(row, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } else { + /* Try to fetch virtual column data from undo log */ + row_vers_build_cur_vrow_low( + in_purge, rec, clust_index, *clust_offsets, + index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr); + } + + *clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + return(cur_vrow); +} + +/** @return whether two data tuples are equal */ +static bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2) +{ + ut_ad(tuple1.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(tuple2.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(&tuple1)); + ut_ad(dtuple_check_typed(&tuple2)); + ut_ad(tuple1.n_fields == tuple2.n_fields); + + for (ulint i= 0; i < tuple1.n_fields; i++) + if (cmp_dfield_dfield(&tuple1.fields[i], &tuple2.fields[i])) + return false; + return true; +} + +/** Find out whether data tuple has missing data type +for indexed virtual column. +@param tuple data tuple +@param index virtual index +@return true if tuple has missing column type */ +static bool dtuple_vcol_data_missing(const dtuple_t &tuple, + dict_index_t *index) +{ + for (ulint i= 0; i < index->n_uniq; i++) + { + dict_col_t *col= index->fields[i].col; + if (!col->is_virtual()) + continue; + dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col); + for (ulint j= 0; j < index->table->n_v_cols; j++) + { + if (vcol == &index->table->v_cols[j] + && tuple.v_fields[j].type.mtype == DATA_MISSING) + return true; + } + } + return false; +} + +/** Finds out if a version of the record, where the version >= the current +purge_sys.view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. 
+@param[in] also_curr TRUE if also rec is included in the versions + to search; otherwise only versions prior + to it are searched +@param[in] rec record in the clustered index; the caller + must have a latch on the page +@param[in] mtr mtr holding the latch on rec; it will + also hold the latch on purge_view +@param[in] index secondary index +@param[in] ientry secondary index entry +@param[in] roll_ptr roll_ptr for the purge record +@param[in] trx_id transaction ID on the purging record +@return TRUE if earlier version should have */ +bool +row_vers_old_has_index_entry( + bool also_curr, + const rec_t* rec, + mtr_t* mtr, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id) +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + rec_offs* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + const dtuple_t* entry; + ulint comp; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + } + + DBUG_EXECUTE_IF("ib_purge_virtual_index_crash", + DBUG_SUICIDE();); + + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row_ext_t* ext; + + /* The top of the stack of versions is locked by the + mtr holding a latch on the page containing the + clustered index record. The bottom of the stack is + locked by the fact that the purge_sys.view must + 'overtake' any read view of an active transaction. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + + +#ifdef DBUG_OFF +# define dbug_v_purge false +#else /* DBUG_OFF */ + bool dbug_v_purge = false; +#endif /* DBUG_OFF */ + + DBUG_EXECUTE_IF( + "ib_purge_virtual_index_callback", + dbug_v_purge = true;); + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr) + || dbug_v_purge) { + + if (!row_vers_build_clust_v_col( + row, clust_index, index, heap)) { + goto unsafe_to_purge; + } + + entry = row_build_index_entry( + row, ext, index, heap); + if (entry && dtuple_coll_eq(*ientry, *entry)) { + goto unsafe_to_purge; + } + } else { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, heap); + /* entry could only be NULL if + the clustered index record is an uncommitted + inserted record whose BLOBs have not been + written yet. The secondary index record + can be safely removed, because it cannot + possibly refer to this incomplete + clustered index record. (Insert would + always first be completed for the + clustered index record, then proceed to + secondary indexes.) 
*/
+
+ if (entry && row_vers_vc_matches_cluster(
+ rec, entry,
+ clust_index, clust_offsets,
+ index, ientry, roll_ptr,
+ trx_id, NULL, &vrow, mtr)) {
+ goto unsafe_to_purge;
+ }
+ }
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ clust_index
+ ->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ } else {
+
+ entry = row_build_index_entry(
+ row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record into the clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records into the secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the row may be being modified so that
+ the clustered index record has already been updated to
+ a different binary value in a char field, but the
+ collation identifies the old and new value anyway! */
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+unsafe_to_purge:
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+ return true;
+ }
+ }
+ } else if (dict_index_has_virtual(index)) {
+ /* The current clustered index record could be
+ deleted, but its previous version might not be. We will
+ need to get the virtual column data from the undo record
+ associated with the current clustered index record. */
+
+ cur_vrow = row_vers_build_cur_vrow(
+ also_curr, rec, clust_index, &clust_offsets,
+ index, roll_ptr, trx_id, heap, v_heap, mtr);
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ vrow = NULL;
+
+ trx_undo_prev_version_build(version,
+ clust_index, clust_offsets,
+ heap, &prev_version, nullptr,
+ dict_index_has_virtual(index)
+ ? &vrow : nullptr,
+ TRX_UNDO_CHECK_PURGEABILITY);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (!prev_version) {
+ /* Versions end here */
+ mem_heap_free(heap);
+
+ if (v_heap) {
+ mem_heap_free(v_heap);
+ }
+
+ return false;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_index_has_virtual(index)) {
+ if (vrow) {
+ if (dtuple_vcol_data_missing(*vrow, index)) {
+ goto nochange_index;
+ }
+ /* Keep the virtual row info for the next
+ version, unless it is changed */
+ mem_heap_empty(v_heap);
+ cur_vrow = dtuple_copy(vrow, v_heap);
+ dtuple_dup_v_fld(cur_vrow, v_heap);
+ }
+
+ if (!cur_vrow) {
+ /* Nothing for this index has changed,
+ continue */
+nochange_index:
+ version = prev_version;
+ continue;
+ }
+ }
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ if (dict_index_has_virtual(index)) {
+ ut_ad(cur_vrow);
+ ut_ad(row->n_v_fields == cur_vrow->n_v_fields);
+ dtuple_copy_v_fields(row, cur_vrow);
+ }
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers.
This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the secondary index record may already
+ have been updated to a different binary value in
+ a char field, but the collation identifies the old
+ and new value anyway! */
+
+ if (entry && dtuple_coll_eq(*ientry, *entry)) {
+ goto unsafe_to_purge;
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ ReadView* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers,/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ dberr_t err;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!view->changes_visible(trx_id));
+
+ ut_ad(!vrow || !(*vrow));
+
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* prev_heap = heap;
+
+ heap = mem_heap_create(1024);
+
+ if (vrow) {
+ *vrow = NULL;
+ }
+
+ /* If purge can't see the record, then we can't rely on
+ the UNDO log record.
*/
+
+ err = trx_undo_prev_version_build(
+ version, index, *offsets, heap,
+ &prev_version, NULL, vrow, 0);
+
+ if (prev_heap != NULL) {
+ mem_heap_free(prev_heap);
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ ut_ad(!vrow || !(*vrow));
+ break;
+ }
+
+ *offsets = rec_get_offsets(
+ prev_version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED, offset_heap);
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (view->changes_visible(trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, true, *offsets);
+
+ if (vrow && *vrow) {
+ *vrow = dtuple_copy(*vrow, in_heap);
+ dtuple_dup_v_fld(*vrow, in_heap);
+ }
+ break;
+ } else if (trx_id >= view->low_limit_id()
+ && trx_id >= trx_sys.get_max_trx_id()) {
+ err = DB_CORRUPTION;
+ break;
+ }
+ version = prev_version;
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */
+# pragma GCC optimize ("O0")
+#endif
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ trx_t* caller_trx,/*!< in/out: trx of current thread */
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ rec_offs** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers,/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ dtuple_t** vrow) /*!< out: virtual row, old version, or NULL
+ if it is not updated in the view */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ trx_id_t rec_trx_id = 0;
+
+ ut_ad(index->is_primary());
+ ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+ | MTR_MEMO_PAGE_S_FIX));
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ version = rec;
+ ut_ad(!vrow || !(*vrow));
+
+ for (;;) {
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ if (!trx_sys.is_registered(caller_trx, version_trx_id)) {
+committed_version_trx:
+ /* We found a version that belongs to a
+ committed transaction: return it.
*/ + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (rec == version) { + *old_vers = rec; + if (vrow) { + *vrow = NULL; + } + break; + } + + /* We assume that a rolled-back transaction stays in + TRX_STATE_ACTIVE state until all the changes have been + rolled back and the transaction is removed from + the global list of transactions. */ + + if (rec_trx_id == version_trx_id) { + /* The transaction was committed while + we searched for earlier versions. + Return the current version as a + semi-consistent read. */ + + version = rec; + *offsets = rec_get_offsets( + version, index, *offsets, + index->n_core_fields, ULINT_UNDEFINED, + offset_heap); + } + + buf = static_cast<byte*>( + mem_heap_alloc( + in_heap, rec_offs_size(*offsets))); + + *old_vers = rec_copy(buf, version, *offsets); + rec_offs_make_valid(*old_vers, index, true, *offsets); + if (vrow && *vrow) { + *vrow = dtuple_copy(*vrow, in_heap); + dtuple_dup_v_fld(*vrow, in_heap); + } + break; + } + + DEBUG_SYNC_C("after_row_vers_check_trx_active"); + + heap2 = heap; + heap = mem_heap_create(1024); + + if (trx_undo_prev_version_build(version, index, *offsets, heap, + &prev_version, in_heap, vrow, + 0) != DB_SUCCESS) { + mem_heap_free(heap); + heap = heap2; + heap2 = NULL; + goto committed_version_trx; + } + + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + ut_ad(!vrow || !(*vrow)); + break; + } + + version = prev_version; + *offsets = rec_get_offsets(version, index, *offsets, + index->n_core_fields, + ULINT_UNDEFINED, offset_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(version, *offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + }/* for (;;) */ + + if (heap) { + mem_heap_free(heap); + } +} |
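
Editor's note: the loop in row_vers_build_for_consistent_read() above walks the version chain of a clustered index record backwards (newest to oldest) until it finds a version whose writer is visible to the read view. The following is a minimal, self-contained C++ sketch of that walk, not InnoDB code: the types RecordVersion and ReadView, the single up_limit_id visibility rule, and build_for_consistent_read() are hypothetical simplifications (the real ReadView::changes_visible() also consults the list of active transaction ids, and the real code rebuilds each older version from the undo log with trx_undo_prev_version_build() instead of iterating a vector).

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct RecordVersion {
    uint64_t    trx_id;   // transaction id that created this version
    std::string payload;  // stand-in for the clustered index column data
};

struct ReadView {
    uint64_t up_limit_id; // simplification: changes by trx_id < this are visible
    bool changes_visible(uint64_t trx_id) const { return trx_id < up_limit_id; }
};

// chain[0] is the record as currently stored in the clustered index; each
// following element stands for the older version that the real code would
// rebuild from the undo log.
std::optional<RecordVersion>
build_for_consistent_read(const std::vector<RecordVersion>& chain,
                          const ReadView& view)
{
    for (const RecordVersion& version : chain) {
        if (view.changes_visible(version.trx_id)) {
            // The real function copies the version into in_heap (rec_copy);
            // here we simply return it by value.
            return version;
        }
    }
    // History exhausted: the record was inserted after the view was created.
    return std::nullopt;
}

int main()
{
    std::vector<RecordVersion> chain = {
        {50, "newest"}, {42, "middle"}, {10, "oldest"}};
    ReadView view{40}; // sees only changes by transactions with id < 40

    if (auto v = build_for_consistent_read(chain, view)) {
        std::cout << "visible version: " << v->payload << '\n'; // prints "oldest"
    } else {
        std::cout << "record not visible in this view\n";
    }
    return 0;
}

The same walk-backwards pattern underlies row_vers_old_has_index_entry() (stop when a non-delete-marked version matches the secondary index entry) and row_vers_build_for_semi_consistent_read() (stop at the first version written by a committed transaction).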