Diffstat (limited to 'storage/innobase/btr/btr0cur.cc')
-rw-r--r-- | storage/innobase/btr/btr0cur.cc | 7017
1 file changed, 7017 insertions, 0 deletions
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc new file mode 100644 index 00000000..e736f338 --- /dev/null +++ b/storage/innobase/btr/btr0cur.cc @@ -0,0 +1,7017 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" +#include "row0upd.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" +#include "srv0start.h" +#include "mysql_com.h" +#include "dict0stats.h" +#include "row0ins.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" +#endif /* WITH_WSREP */ +#include "log.h" + +/** Buffered B-tree operation types, introduced as part of delete buffering. */ +enum btr_op_t { + BTR_NO_OP = 0, /*!< Not buffered */ + BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ + BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ + BTR_DELETE_OP, /*!< Purge a delete-marked record */ + BTR_DELMARK_OP /*!< Mark a record for deletion */ +}; + +/** Modification types for the B-tree operation. + Note that the order must be DELETE, BOTH, INSERT !! + */ +enum btr_intention_t { + BTR_INTENTION_DELETE, + BTR_INTENTION_BOTH, + BTR_INTENTION_INSERT +}; + +/** For the index->lock scalability improvement, only possibility of clear +performance regression observed was caused by grown huge history list length. +That is because the exclusive use of index->lock also worked as reserving +free blocks and read IO bandwidth with priority. 
To avoid huge glowing history +list as same level with previous implementation, prioritizes pessimistic tree +operations by purge as the previous, when it seems to be growing huge. + + Experimentally, the history list length starts to affect to performance +throughput clearly from about 100000. */ +#define BTR_CUR_FINE_HISTORY_LENGTH 100000 + +#ifdef BTR_CUR_HASH_ADAPT +/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_non_sea_old; +/** Number of successful adaptive hash index lookups in +btr_cur_t::search_leaf(). */ +ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ + +/* @} */ + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record. 
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ + +/*==================== B-TREE SEARCH =========================*/ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + dberr_t err; + const fil_space_t* space = index->table->space; + if (!space) { +corrupted: + err = DB_CORRUPTION; +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + index->table->file_unreadable = true; + return err; + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + if (!root) { + goto unreadable; + } + + if (btr_cur_instant_root_init(index, root->page.frame)) { + goto corrupted; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); + err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr); + ut_d(index->in_instant_init = false); + if (err != DB_SUCCESS) { + index->table->file_unreadable = true; + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->page.frame)); + + const rec_t* rec = page_cur_move_to_next(&cur.page_cur); + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0; + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (rec && !index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. */ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. 
We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. */ + + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. 
*/ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + if (!block) { + goto incompatible; + } + + if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->page.frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->page.frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->page.frame + [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. */ + } + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). 
*/ + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + cur.page_cur.block->zip_size(), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + + if (page_has_siblings(page)) { + return true; + } + + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */ + if (page_is_comp(page) && page_get_instant(page)) { + return true; + } + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields. 
*/ + return true; + } + + if (n > REC_MAX_N_FIELDS) { + return true; + } + + index->n_core_fields = n & dict_index_t::MAX_N_FIELDS; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(index->get_n_nullable(n))); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; +} + +/** +Gets intention in btr_intention_t from latch_mode, and cleares the intention +at the latch_mode. +@param latch_mode in/out: pointer to latch_mode +@return intention for latching tree */ +static +btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode) +{ + btr_intention_t intention; + + switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) { + case BTR_LATCH_FOR_INSERT: + intention = BTR_INTENTION_INSERT; + break; + case BTR_LATCH_FOR_DELETE: + intention = BTR_INTENTION_DELETE; + break; + default: + /* both or unknown */ + intention = BTR_INTENTION_BOTH; + } + *latch_mode = btr_latch_mode( + *latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)); + + return(intention); +} + +/** @return whether the distance between two records is at most the +specified value */ +static bool +page_rec_distance_is_at_most(const rec_t *left, const rec_t *right, ulint val) +{ + do + { + if (left == right) + return true; + left= page_rec_get_next_const(left); + } + while (left && val--); + return false; +} + +/** Detects whether the modifying record might need a modifying tree structure. +@param[in] index index +@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@param[in] rec_size size of the record or max size of node_ptr +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] mtr mtr +@return true if tree modification is needed */ +static +bool +btr_cur_will_modify_tree( + dict_index_t* index, + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec, + ulint rec_size, + ulint zip_size, + mtr_t* mtr) +{ + ut_ad(!page_is_leaf(page)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* Pessimistic delete of the first record causes delete & insert + of node_ptr at upper level. And a subsequent page shrink is + possible. It causes delete of node_ptr at the upper level. + So we should pay attention also to 2nd record not only + first record and last record. Because if the "delete & insert" are + done for the different page, the 2nd record become + first record and following compress might delete the record and causes + the uppper level node_ptr modification. 
*/ + + const ulint n_recs = page_get_n_recs(page); + + if (lock_intention <= BTR_INTENTION_BOTH) { + compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH); + compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT); + + if (!page_has_siblings(page)) { + return true; + } + + ulint margin = rec_size; + + if (lock_intention == BTR_INTENTION_BOTH) { + ulint level = btr_page_get_level(page); + + /* This value is the worst expectation for the node_ptr + records to be deleted from this page. It is used to + expect whether the cursor position can be the left_most + record in this page or not. */ + ulint max_nodes_deleted = 0; + + /* By modifying tree operations from the under of this + level, logically (2 ^ (level - 1)) opportunities to + deleting records in maximum even unreally rare case. */ + if (level > 7) { + /* TODO: adjust this practical limit. */ + max_nodes_deleted = 64; + } else if (level > 0) { + max_nodes_deleted = (ulint)1 << (level - 1); + } + /* check delete will cause. (BTR_INTENTION_BOTH + or BTR_INTENTION_DELETE) */ + if (n_recs <= max_nodes_deleted * 2 + || page_rec_is_first(rec, page)) { + /* The cursor record can be the left most record + in this page. */ + return true; + } + + if (page_has_prev(page) + && page_rec_distance_is_at_most( + page_get_infimum_rec(page), rec, + max_nodes_deleted)) { + return true; + } + + if (page_has_next(page) + && page_rec_distance_is_at_most( + rec, page_get_supremum_rec(page), + max_nodes_deleted)) { + return true; + } + + /* Delete at leftmost record in a page causes delete + & insert at its parent page. After that, the delete + might cause btr_compress() and delete record at its + parent page. Thus we should consider max deletes. */ + margin *= max_nodes_deleted; + } + + /* Safe because we already have SX latch of the index tree */ + if (page_get_data_size(page) + < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) { + return(true); + } + } + + if (lock_intention >= BTR_INTENTION_BOTH) { + /* check insert will cause. BTR_INTENTION_BOTH + or BTR_INTENTION_INSERT*/ + + /* Once we invoke the btr_cur_limit_optimistic_insert_debug, + we should check it here in advance, since the max allowable + records in a page is limited. */ + LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true); + + /* needs 2 records' space for the case the single split and + insert cannot fit. + page_get_max_insert_size_after_reorganize() includes space + for page directory already */ + ulint max_size + = page_get_max_insert_size_after_reorganize(page, 2); + + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size + || max_size < rec_size * 2) { + return(true); + } + + /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED. + This is based on the worst case, and we could invoke + page_zip_available() on the block->page.zip. */ + /* needs 2 records' space also for worst compress rate. */ + if (zip_size + && page_zip_empty_size(index->n_fields, zip_size) + <= rec_size * 2 + page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + 2)) { + return(true); + } + } + + return(false); +} + +/** Detects whether the modifying record might need a opposite modification +to the intention. 
+@param bpage buffer pool page +@param is_clust whether this is a clustered index +@param lock_intention lock intention for the tree operation +@param node_ptr_max_size the maximum size of a node pointer +@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) +@param rec record (current node_ptr) +@return true if tree modification is needed */ +static bool btr_cur_need_opposite_intention(const buf_page_t &bpage, + bool is_clust, + btr_intention_t lock_intention, + ulint node_ptr_max_size, + ulint compress_limit, + const rec_t *rec) +{ + if (UNIV_LIKELY_NULL(bpage.zip.data) && + !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1)) + return true; + const page_t *const page= bpage.frame; + if (lock_intention != BTR_INTENTION_INSERT) + { + /* We compensate also for btr_cur_compress_recommendation() */ + if (!page_has_siblings(page) || + page_rec_is_first(rec, page) || page_rec_is_last(rec, page) || + page_get_data_size(page) < node_ptr_max_size + compress_limit) + return true; + if (lock_intention == BTR_INTENTION_DELETE) + return false; + } + else if (page_has_next(page) && page_rec_is_last(rec, page)) + return true; + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true); + const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2); + return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size || + max_size < node_ptr_max_size * 2; +} + +/** +@param[in] index b-tree +@return maximum size of a node pointer record in bytes */ +static ulint btr_node_ptr_max_size(const dict_index_t* index) +{ + if (dict_index_is_ibuf(index)) { + /* cannot estimate accurately */ + /* This is universal index for change buffer. + The max size of the entry is about max key length * 2. + (index key + primary key to be inserted to the index) + (The max key length is UNIV_PAGE_SIZE / 16 * 3 at + ha_innobase::max_supported_key_length(), + considering MAX_KEY_LENGTH = 3072 at MySQL imposes + the 3500 historical InnoDB value for 16K page size case.) + For the universal index, node_ptr contains most of the entry. + And 512 is enough to contain ibuf columns and meta-data */ + return srv_page_size / 8 * 3 + 512; + } + + /* Each record has page_no, length of page_no and header. */ + ulint comp = dict_table_is_comp(index->table); + ulint rec_max_size = comp + ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable) + : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES + + 2 * index->n_fields; + + /* Compute the maximum possible record size. */ + for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* Determine the maximum length of the index field. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. 
*/ + rec_max_size += field_max_size; + continue; + } + + field_max_size = dict_col_get_max_size(col); + if (UNIV_UNLIKELY(!field_max_size)) { + switch (col->mtype) { + case DATA_VARCHAR: + if (!comp + && (!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS"))) { + break; + } + /* fall through */ + case DATA_FIXBINARY: + case DATA_BINARY: + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* BINARY(0), VARBINARY(0), + CHAR(0) and VARCHAR(0) are possible + data type definitions in MariaDB. + The InnoDB internal SQL parser maps + CHAR to DATA_VARCHAR, so DATA_CHAR (or + DATA_MYSQL) is only coming from the + MariaDB SQL layer. */ + if (comp) { + /* Add a length byte, because + fixed-length empty field are + encoded as variable-length. + For ROW_FORMAT=REDUNDANT, + these bytes were added to + rec_max_size before this loop. */ + rec_max_size++; + } + continue; + } + + /* SYS_FOREIGN.ID is defined as CHAR in the + InnoDB internal SQL parser, which translates + into the incorrect VARCHAR(0). InnoDB does + not enforce maximum lengths of columns, so + that is why any data can be inserted in the + first place. + + Likewise, SYS_FOREIGN.FOR_NAME, + SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are + defined as CHAR, and also they are part of a key. */ + + ut_ad(!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS")); + ut_ad(!comp); + ut_ad(col->mtype == DATA_VARCHAR); + + rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX) + ? REDUNDANT_REC_MAX_DATA_SIZE + : page_get_free_space_of_empty(FALSE) / 2; + } else if (field_max_size == NAME_LEN && i == 1 + && (!strcmp(index->table->name.m_name, + TABLE_STATS_NAME) + || !strcmp(index->table->name.m_name, + INDEX_STATS_NAME))) { + /* Interpret "table_name" as VARCHAR(199) even + if it was incorrectly defined as VARCHAR(64). + While the caller of ha_innobase enforces the + maximum length on any data written, the InnoDB + internal SQL parser will happily write as much + data as is provided. The purpose of this hack + is to avoid InnoDB hangs after persistent + statistics on partitioned tables are + deleted. */ + field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN; + } + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len + && field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } + + rec_max_size += field_max_size; + } + + return rec_max_size; +} + +/** @return a B-tree search mode suitable for non-leaf pages +@param mode leaf page search mode */ +static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode) +{ + if (mode > PAGE_CUR_GE) + { + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); + return mode; + } + if (mode == PAGE_CUR_GE) + return PAGE_CUR_L; + ut_ad(mode == PAGE_CUR_G); + return PAGE_CUR_LE; +} + +static MY_ATTRIBUTE((nonnull)) +/** Acquire a latch on the previous page without violating the latching order. 
+@param block index page +@param page_id page identifier with valid space identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH) +@param mtr mini-transaction +@param err error code +@retval 0 if an error occurred +@retval 1 if the page could be latched in the wrong order +@retval -1 if the latch on block was temporarily released */ +int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size, + rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err) +{ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + ut_ad(page_id.space() == block->page.id().space()); + + const auto prev_savepoint= mtr->get_savepoint(); + ut_ad(block == mtr->at_savepoint(prev_savepoint - 1)); + + page_id.set_page_no(btr_page_get_prev(block->page.frame)); + buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + + int ret= 1; + if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH)) + { + if (UNIV_LIKELY(prev->page.lock.s_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX); + goto prev_latched; + } + block->page.lock.s_unlock(); + } + else + { + if (UNIV_LIKELY(prev->page.lock.x_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX); + goto prev_latched; + } + block->page.lock.x_unlock(); + } + + ret= -1; + mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX); + mtr->rollback_to_savepoint(prev_savepoint); + prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch); + + prev_latched: + if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame, + FIL_PAGE_TYPE + block->page.frame, 2) || + memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame, + PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) || + page_is_comp(prev->page.frame) != page_is_comp(block->page.frame)) + { + ut_ad("corrupted" == 0); // FIXME: remove this + *err= DB_CORRUPTION; + ret= 0; + } + + return ret; +} + +dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ut_ad(index()->is_btree() || index()->is_ibuf()); + ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + + buf_block_t *guess; + btr_op_t btr_op; + btr_intention_t lock_intention; + bool detected_same_key_root= false; + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + + ut_ad(dict_index_check_search_tuple(index(), tuple)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index()->page != FIL_NULL); + + MEM_UNDEFINED(&up_match, sizeof up_match); + MEM_UNDEFINED(&up_bytes, sizeof up_bytes); + MEM_UNDEFINED(&low_match, sizeof low_match); + MEM_UNDEFINED(&low_bytes, sizeof low_bytes); + ut_d(up_match= ULINT_UNDEFINED); + ut_d(low_match= ULINT_UNDEFINED); + + ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) || + mtr->memo_contains_flagged(&index()->lock, + MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK | + MTR_MEMO_X_LOCK)); + + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. 
*/ + switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) { + default: + btr_op= BTR_NO_OP; + break; + case BTR_INSERT: + btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op= BTR_DELETE_OP; + ut_a(purge_node); + break; + case BTR_DELETE_MARK: + btr_op= BTR_DELMARK_OP; + break; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf()); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_clust()); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary()); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + + flag= BTR_CUR_BINARY; +#ifndef BTR_CUR_ADAPT + guess= nullptr; +#else + btr_search_t *info= btr_search_get_info(index()); + guess= info->root_guess; + +# ifdef BTR_CUR_HASH_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + bool ahi_enabled= btr_search_enabled && !index()->is_ibuf(); + /* We do a dirty read of btr_search_enabled below, + and btr_search_guess_on_hash() will have to check it again. */ + if (!ahi_enabled); + else if (btr_search_guess_on_hash(index(), info, tuple, mode, + latch_mode, this, mtr)) + { + /* Search using the hash index succeeded */ + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ++btr_cur_n_sea; + + return DB_SUCCESS; + } + else + ++btr_cur_n_non_sea; +# endif +#endif + + /* If the hash search did not succeed, do binary search down the + tree */ + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + const ulint savepoint= mtr->get_savepoint(); + + ulint node_ptr_max_size= 0, compress_limit= 0; + rw_lock_type_t rw_latch= RW_S_LATCH; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + rw_latch= RW_X_LATCH; + node_ptr_max_size= btr_node_ptr_max_size(index()); + if (latch_by_caller) + { + ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); + break; + } + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index()); + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + /* Most delete-intended operations are due to the purge of history. + Prioritize them when the history list is growing huge. */ + mtr_x_lock_index(index(), mtr); + break; + } + } + mtr_sx_lock_index(index(), mtr); + break; +#ifdef UNIV_DEBUG + case BTR_CONT_MODIFY_TREE: + ut_ad("invalid mode" == 0); + break; +#endif + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_SX_LATCH; + /* fall through */ + default: + if (!latch_by_caller) + mtr_s_lock_index(index(), mtr); + } + + const ulint zip_size= index()->table->space->zip_size(); + + /* Start with the root page. 
*/ + page_id_t page_id(index()->table->space_id, index()->page); + + const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode); + ulint height= ULINT_UNDEFINED; + up_match= 0; + up_bytes= 0; + low_match= 0; + low_bytes= 0; + ulint buf_mode= BUF_GET; + search_loop: + dberr_t err; + auto block_savepoint= mtr->get_savepoint(); + buf_block_t *block= + buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr, + &err, height == 0 && !index()->is_clust()); + if (!block) + { + switch (err) { + case DB_DECRYPTION_FAILED: + btr_decryption_failed(*index()); + /* fall through */ + default: + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + case DB_SUCCESS: + /* This must be a search to perform an insert, delete mark, or delete; + try using the change buffer */ + ut_ad(height == 0); + ut_ad(thr); + break; + } + + switch (btr_op) { + default: + MY_ASSERT_UNREACHABLE(); + break; + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr)) + { + flag= BTR_CUR_INSERT_TO_IBUF; + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index(), page_id, zip_size, thr)) + { + flag = BTR_CUR_DEL_MARK_IBUF; + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); + + if (!row_purge_poss_sec(purge_node, index(), tuple)) + /* The record cannot be purged yet. */ + flag= BTR_CUR_DELETE_REF; + else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), + page_id, zip_size, thr)) + /* The purge was buffered. */ + flag= BTR_CUR_DELETE_IBUF; + else + { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id, chain); + break; + } + + buf_pool.watch_unset(page_id, chain); + goto func_exit; + } + + /* Change buffering did not succeed, we must read the page. */ + buf_mode= BUF_GET; + goto search_loop; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + ut_ad("corrupted" == 0); // FIXME: remove this + err= DB_CORRUPTION; + goto func_exit; + } + + page_cur.block= block; + ut_ad(block == mtr->at_savepoint(block_savepoint)); + ut_ad(rw_latch != RW_NO_LATCH); +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the B-tree index root page. */ +#ifdef BTR_CUR_ADAPT + info->root_guess= block; +#endif + height= page_level; + tree_height= height + 1; + + if (!height) + { + /* The root page is also a leaf page. + We may have to reacquire the page latch in a different mode. 
*/ + switch (rw_latch) { + case RW_S_LATCH: + if ((latch_mode & ~12) != RW_S_LATCH) + { + ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH); + goto relatch_x; + } + if (latch_mode != BTR_MODIFY_PREV) + { + if (!latch_by_caller) + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + goto reached_latched_leaf; + } + /* fall through */ + case RW_SX_LATCH: + ut_ad(rw_latch == RW_S_LATCH || + latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + relatch_x: + mtr->rollback_to_savepoint(block_savepoint); + height= ULINT_UNDEFINED; + rw_latch= RW_X_LATCH; + goto search_loop; + case RW_X_LATCH: + if (latch_mode == BTR_MODIFY_TREE) + goto reached_index_root_and_leaf; + goto reached_root_and_leaf; + case RW_NO_LATCH: + ut_ad(0); + } + goto reached_leaf; + } + } + else if (UNIV_UNLIKELY(height != page_level)) + goto corrupted; + else + switch (latch_mode) { + case BTR_MODIFY_TREE: + break; + case BTR_MODIFY_ROOT_AND_LEAF: + ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() == + index()->page) == (tree_height <= height + 2)); + if (tree_height <= height + 2) + /* Retain the root page latch. */ + break; + /* fall through */ + default: + ut_ad(block_savepoint > savepoint); + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + + if (!height) + { + reached_leaf: + /* We reached the leaf level. */ + ut_ad(block == mtr->at_savepoint(block_savepoint)); + + if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF) + { + reached_root_and_leaf: + if (!latch_by_caller) + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + reached_index_root_and_leaf: + ut_ad(rw_latch == RW_X_LATCH); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif + if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + goto func_exit; + } + + switch (latch_mode) { + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); + ut_ad(!latch_by_caller); + ut_ad(rw_latch == + rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH))); + + /* latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + rw_latch, false, mtr, &err)) + goto func_exit; + goto release_tree; + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + if (!latch_by_caller) + { +release_tree: + /* Release the tree s-latch */ + block_savepoint--; + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + } + /* release upper blocks */ + if (savepoint < block_savepoint) + mtr->rollback_to_savepoint(savepoint, block_savepoint); + break; + default: + ut_ad(latch_mode == BTR_MODIFY_TREE); + ut_ad(rw_latch == RW_X_LATCH); + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + if (btr_cur_need_opposite_intention(block->page, index()->is_clust(), + lock_intention, + 
node_ptr_max_size, compress_limit, + page_cur.rec)) + goto need_opposite_intention; + } + + reached_latched_leaf: +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)) + { + if (page_cur_search_with_match_bytes(tuple, mode, + &up_match, &up_bytes, + &low_match, &low_bytes, &page_cur)) + goto corrupted; + } + else +#endif /* BTR_CUR_HASH_ADAPT */ + if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + +#ifdef BTR_CUR_HASH_ADAPT + /* We do a dirty read of btr_search_enabled here. We will + properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a page hash + index, while holding search latch. */ + if (!btr_search_enabled); + else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) + /* This may be a search tuple for btr_pcur_t::restore_position(). */ + ut_ad(tuple->is_metadata() || + (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); + else if (index()->table->is_temporary()); + else if (!rec_is_metadata(page_cur.rec, *index())) + btr_search_info_update(index(), this); +#endif /* BTR_CUR_HASH_ADAPT */ + + goto func_exit; + } + + guess= nullptr; + if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, + &heap); + + ut_ad(block == mtr->at_savepoint(block_savepoint)); + + switch (latch_mode) { + default: + break; + case BTR_MODIFY_TREE: + if (btr_cur_need_opposite_intention(block->page, index()->is_clust(), + lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + /* If the rec is the first or last in the page for pessimistic + delete intention, it might cause node_ptr insert for the upper + level. We should change the intention and retry. */ + need_opposite_intention: + return pessimistic_search_leaf(tuple, mode, mtr); + + if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH || + index()->is_unique() || + (up_match <= rec_offs_n_fields(offsets) && + low_match <= rec_offs_n_fields(offsets))) + break; + + /* If the first or the last record of the page or the same key + value to the first record or last record, then another page might + be chosen when BTR_CONT_MODIFY_TREE. So, the parent page should + not released to avoiding deadlock with blocking the another search + with the same key value. 
*/ + const rec_t *first= + page_rec_get_next_const(page_get_infimum_rec(block->page.frame)); + ulint matched_fields; + + if (UNIV_UNLIKELY(!first)) + goto corrupted; + if (page_cur.rec == first || + page_rec_is_last(page_cur.rec, block->page.frame)) + { + same_key_root: + detected_same_key_root= true; + break; + } + + matched_fields= 0; + offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + if (const rec_t* last= + page_rec_get_prev_const(page_get_supremum_rec(block->page.frame))) + { + matched_fields= 0; + offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + } + else + goto corrupted; + + /* Release the non-root parent page unless it may need to be modified. */ + if (tree_height > height + 1 && + !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention, + page_cur.rec, node_ptr_max_size, + zip_size, mtr)) + { + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + } + + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + if (!--height) + { + /* We are about to access the leaf level. */ + + switch (latch_mode) { + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_X_LATCH; + break; + case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */ + case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (page_has_prev(block->page.frame) && + page_rec_is_first(page_cur.rec, block->page.frame)) + { + ut_ad(block_savepoint + 1 == mtr->get_savepoint()); + + /* Latch the previous page if the node pointer is the leftmost + of the current page. */ + int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err); + if (!ret) + goto func_exit; + ut_ad(block_savepoint + 2 == mtr->get_savepoint()); + if (ret < 0) + { + /* While our latch on the level-2 page prevents splits or + merges of this level-1 block, other threads may have + modified it due to splitting or merging some level-0 (leaf) + pages underneath it. Thus, we must search again. */ + if (page_cur_search_with_match(tuple, page_mode, + &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, + ULINT_UNDEFINED, &heap); + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, + offsets)); + } + } + rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)); + break; + case BTR_MODIFY_LEAF: + case BTR_SEARCH_LEAF: + rw_latch= rw_lock_type_t(latch_mode); + if (btr_op != BTR_NO_OP && !index()->is_ibuf() && + ibuf_should_try(index(), btr_op != BTR_INSERT_OP)) + /* Try to buffer the operation if the leaf page + is not in the buffer pool. */ + buf_mode= btr_op == BTR_DELETE_OP + ? 
BUF_GET_IF_IN_POOL_OR_WATCH + : BUF_GET_IF_IN_POOL; + break; + case BTR_MODIFY_TREE: + ut_ad(rw_latch == RW_X_LATCH); + + if (lock_intention == BTR_INTENTION_INSERT && + page_has_next(block->page.frame) && + page_rec_is_last(page_cur.rec, block->page.frame)) + { + /* btr_insert_into_right_sibling() might cause deleting node_ptr + at upper level */ + mtr->rollback_to_savepoint(block_savepoint); + goto need_opposite_intention; + } + break; + default: + ut_ad(rw_latch == RW_X_LATCH); + } + } + + goto search_loop; +} + +ATTRIBUTE_COLD void mtr_t::index_lock_upgrade() +{ + auto &slot= m_memo[get_savepoint() - 1]; + if (slot.type == MTR_MEMO_X_LOCK) + return; + ut_ad(slot.type == MTR_MEMO_SX_LOCK); + index_lock *lock= static_cast<index_lock*>(slot.object); + lock->u_x_upgrade(SRW_LOCK_CALL); + slot.type= MTR_MEMO_X_LOCK; +} + +ATTRIBUTE_COLD +dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, + page_cur_mode_t mode, mtr_t *mtr) +{ + ut_ad(index()->is_btree() || index()->is_ibuf()); + ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets= offsets_; + rec_offs_init(offsets_); + + ut_ad(flag == BTR_CUR_BINARY); + ut_ad(dict_index_check_search_tuple(index(), tuple)); + ut_ad(dtuple_check_typed(tuple)); + buf_block_t *block= mtr->at_savepoint(1); + ut_ad(block->page.id().page_no() == index()->page); + block->page.fix(); + mtr->rollback_to_savepoint(1); + mtr->index_lock_upgrade(); + + const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)}; + + mtr->page_lock(block, RW_X_LATCH); + + up_match= 0; + up_bytes= 0; + low_match= 0; + low_bytes= 0; + ulint height= btr_page_get_level(block->page.frame); + tree_height= height + 1; + mem_heap_t *heap= nullptr; + + search_loop: + dberr_t err; + page_cur.block= block; + + if (UNIV_UNLIKELY(!height)) + { + if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, + &page_cur, nullptr)) + corrupted: + err= DB_CORRUPTION; + else + { + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + +#ifdef BTR_CUR_HASH_ADAPT + /* We do a dirty read of btr_search_enabled here. We will + properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a page hash + index, while holding search latch. */ + if (!btr_search_enabled); + else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) + /* This may be a search tuple for btr_pcur_t::restore_position(). 
*/ + ut_ad(tuple->is_metadata() || + (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); + else if (index()->table->is_temporary()); + else if (!rec_is_metadata(page_cur.rec, *index())) + btr_search_info_update(index(), this); +#endif /* BTR_CUR_HASH_ADAPT */ + err= DB_SUCCESS; + } + + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + + page_id_t page_id{block->page.id()}; + + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, + &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + block= + buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET, + mtr, &err, !--height && !index()->is_clust()); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index()); + goto func_exit; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + goto corrupted; + + if (height != btr_page_get_level(block->page.frame)) + goto corrupted; + +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t *page_zip= buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, block->zip_size(), + RW_X_LATCH, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + goto search_loop; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given non-leaf level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +cursor->up_match and cursor->low_match both will have sensible values. +Cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param latch RW_S_LATCH or RW_X_LATCH +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@return DB_SUCCESS on success or error code otherwise */ +TRANSACTIONAL_TARGET +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + rw_lock_type_t rw_latch, + btr_cur_t *cursor, mtr_t *mtr) +{ + dict_index_t *const index= cursor->index(); + + ut_ad(index->is_btree() || index->is_ibuf()); + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + ut_ad(level); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(index->is_ibuf() ? 
ibuf_inside(mtr) : index->is_btree()); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); + cursor->up_match= 0; + cursor->low_match= 0; + cursor->flag= BTR_CUR_BINARY; + +#ifndef BTR_CUR_ADAPT + buf_block_t *block= nullptr; +#else + btr_search_t *info= btr_search_get_info(index); + buf_block_t *block= info->root_guess; +#endif /* BTR_CUR_ADAPT */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + + const ulint zip_size= index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + ulint height= ULINT_UNDEFINED; + +search_loop: + dberr_t err= DB_SUCCESS; + if (buf_block_t *b= + mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch))) + block= b; + else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, + block, BUF_GET, mtr, &err))) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + goto func_exit; + } + +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + err= DB_CORRUPTION; + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= page_level; + if (!height) + goto corrupted; + cursor->tree_height= height + 1; + } + else if (height != ulint{page_level}) + goto corrupted; + + cursor->page_cur.block= block; + + /* Search for complete index fields. */ + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match, + &cursor->low_match, &cursor->page_cur, + nullptr)) + goto corrupted; + + /* If this is the desired level, leave the loop */ + if (level == height) + goto func_exit; + + ut_ad(height > level); + height--; + + offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec, + offsets)); + block= nullptr; + goto search_loop; +} + +dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ulint n_blocks= 0; + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + dberr_t err; + + rec_offs_init(offsets_); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); + + btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + auto savepoint= mtr->get_savepoint(); + + rw_lock_type_t upper_rw_latch= RW_X_LATCH; + ulint node_ptr_max_size= 0, compress_limit= 0; + + if (latch_mode == BTR_MODIFY_TREE) + { + node_ptr_max_size= btr_node_ptr_max_size(index); + /* Most of delete-intended operations are purging. 
Free blocks + and read IO bandwidth should be prioritized for them, when the + history list is growing huge. */ + savepoint++; + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index); + + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + mtr_x_lock_index(index, mtr); + goto index_locked; + } + } + mtr_sx_lock_index(index, mtr); + } + else + { + static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), ""); + ut_ad(!(latch_mode & 8)); + /* This function doesn't need to lock left page of the leaf page */ + static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), ""); + static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), ""); + latch_mode= btr_latch_mode(latch_mode & ~4); + ut_ad(!latch_by_caller || + mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK)); + upper_rw_latch= RW_S_LATCH; + if (!latch_by_caller) + { + savepoint++; + mtr_s_lock_index(index, mtr); + } + } + +index_locked: + ut_ad(savepoint == mtr->get_savepoint()); + + const rw_lock_type_t root_leaf_rw_latch= + rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH)); + + page_cur.index = index; + + uint32_t page= index->page; + const auto zip_size= index->table->space->zip_size(); + + for (ulint height= ULINT_UNDEFINED;;) + { + ut_ad(n_blocks < BTR_MAX_LEVELS); + ut_ad(savepoint + n_blocks == mtr->get_savepoint()); + + buf_block_t* block= + btr_block_get(*index, page, + height ? upper_rw_latch : root_leaf_rw_latch, + !height, mtr, &err); + ut_ad(!block == (err != DB_SUCCESS)); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + break; + } + + if (first) + page_cur_set_before_first(block, &page_cur); + else + page_cur_set_after_last(block, &page_cur); + + const uint32_t l= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= l; + if (height); + else if (upper_rw_latch != root_leaf_rw_latch) + { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(n_blocks == 0); + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + upper_rw_latch= root_leaf_rw_latch; + mtr->rollback_to_savepoint(savepoint); + height= ULINT_UNDEFINED; + continue; + } + else + { + reached_leaf: + const auto leaf_savepoint= mtr->get_savepoint(); + ut_ad(leaf_savepoint); + ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1)); + + if (latch_mode == BTR_MODIFY_TREE) + { + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH, + mtr, &err)) + break; + if (page_has_next(block->page.frame) && + !btr_block_get(*index, btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + break; + + if (!index->lock.have_x() && + btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, + compress_limit, page_cur.rec)) + goto need_opposite_intention; + } + else + { + if (latch_mode != BTR_CONT_MODIFY_TREE) + { + ut_ad(latch_mode == BTR_MODIFY_LEAF || + latch_mode == BTR_SEARCH_LEAF); + /* Release index->lock if needed, and the non-leaf pages. */ + mtr->rollback_to_savepoint(savepoint - !latch_by_caller, + leaf_savepoint - 1); + } + } + break; + } + } + else if (UNIV_UNLIKELY(height != l)) + { + corrupted: + err= DB_CORRUPTION; + break; + } + + if (!height) + goto reached_leaf; + + height--; + + if (first + ? 
!page_cur_move_to_next(&page_cur) + : !page_cur_move_to_prev(&page_cur)) + goto corrupted; + + offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, + &heap); + + ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); + + if (latch_mode != BTR_MODIFY_TREE); + else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + { + need_opposite_intention: + /* If the rec is the first or last in the page for pessimistic + delete intention, it might cause node_ptr insert for the upper + level. We should change the intention and retry. */ + + mtr->rollback_to_savepoint(savepoint); + mtr->index_lock_upgrade(); + /* X-latch all pages from now on */ + latch_mode= BTR_CONT_MODIFY_TREE; + page= index->page; + height= ULINT_UNDEFINED; + n_blocks= 0; + continue; + } + else + { + if (!btr_cur_will_modify_tree(index, block->page.frame, + lock_intention, page_cur.rec, + node_ptr_max_size, zip_size, mtr)) + { + ut_ad(n_blocks); + /* release buffer-fixes on pages that will not be modified + (except the root) */ + if (n_blocks > 1) + { + mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1); + n_blocks= 1; + } + } + } + + /* Go to the child node */ + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); + n_blocks++; + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + return err; +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to inserted record if succeed, else NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext, + mtr); + + /* If the record did not fit, reorganize. + For compressed pages, page_cur_tuple_insert() + attempted this already. */ + if (!rec && !page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, mtr) == DB_SUCCESS) { + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, + n_ext, mtr); + } + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. 
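+Undo logging (and the filling in of DB_TRX_ID and DB_ROLL_PTR) is only
+done for an insert into a leaf page of the clustered index; for a
+secondary index only the locks are checked.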
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6))) +dberr_t +btr_cur_ins_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert */ + dtuple_t* entry, /*!< in/out: entry to insert */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: true if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + if (!(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) { + return DB_SUCCESS; + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec_t* rec = btr_cur_get_rec(cursor); + dict_index_t* index = cursor->index(); + + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad((flags & BTR_NO_UNDO_LOG_FLAG) + || !index->table->skip_alter_undo); + + ut_ad(mtr->is_named_space(index->table->space)); + + /* Check if there is predicate or GAP lock preventing the insertion */ + if (!(flags & BTR_NO_LOCKING_FLAG)) { + const unsigned type = index->type; + if (UNIV_UNLIKELY(type & DICT_SPATIAL)) { + lock_prdt_t prdt; + rtr_mbr_t mbr; + + rtr_get_mbr_from_tuple(entry, &mbr); + + /* Use on stack MBR variable to test if a lock is + needed. If so, the predicate (MBR) will be allocated + from lock heap in lock_prdt_insert_check_and_lock() */ + lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr); + + if (dberr_t err = lock_prdt_insert_check_and_lock( + rec, btr_cur_get_block(cursor), + index, thr, mtr, &prdt)) { + return err; + } + *inherit = false; + } else { + ut_ad(!dict_index_is_online_ddl(index) + || index->is_primary() + || (flags & BTR_CREATE_FLAG)); +#ifdef WITH_WSREP + trx_t* trx= thr_get_trx(thr); + /* If transaction scanning an unique secondary + key is wsrep high priority thread (brute + force) this scanning may involve GAP-locking + in the index. As this locking happens also + when applying replication events in high + priority applier threads, there is a + probability for lock conflicts between two + wsrep high priority threads. To avoid this + GAP-locking we mark that this transaction + is using unique key scan here. 
*/ + if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE + && trx->is_wsrep() + && wsrep_thd_is_BF(trx->mysql_thd, false)) { + trx->wsrep = 3; + } +#endif /* WITH_WSREP */ + if (dberr_t err = lock_rec_insert_check_and_lock( + rec, btr_cur_get_block(cursor), + index, thr, mtr, inherit)) { + return err; + } + } + } + + if (!index->is_primary() || !page_is_leaf(page_align(rec))) { + return DB_SUCCESS; + } + + constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1} + << ROLL_PTR_INSERT_FLAG_POS; + roll_ptr_t roll_ptr = dummy_roll_ptr; + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + if (dberr_t err = trx_undo_report_row_operation( + thr, index, entry, NULL, 0, NULL, NULL, + &roll_ptr)) { + return err; + } + + if (roll_ptr != dummy_roll_ptr) { + dfield_t* r = dtuple_get_nth_field(entry, + index->db_trx_id()); + trx_write_trx_id(static_cast<byte*>(r->data), + thr_get_trx(thr)->id); + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + dfield_t* r = dtuple_get_nth_field( + entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr); + } + + return DB_SUCCESS; +} + +/** +Prefetch siblings of the leaf for the pessimistic operation. +@param block leaf page +@param index index of the page */ +static void btr_cur_prefetch_siblings(const buf_block_t *block, + const dict_index_t *index) +{ + ut_ad(page_is_leaf(block->page.frame)); + + if (index->is_ibuf()) + return; + + const page_t *page= block->page.frame; + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); + + fil_space_t *space= index->table->space; + + if (prev == FIL_NULL); + else if (space->acquire()) + buf_read_page_background(space, page_id_t(space->id, prev), + block->zip_size()); + if (next == FIL_NULL); + else if (space->acquire()) + buf_read_page_background(space, page_id_t(space->id, next), + block->zip_size()); +} + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. 
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; + ulint rec_size; + dberr_t err; + + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index(); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + +#ifdef HAVE_valgrind + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); + } +#endif /* HAVE_valgrind */ + + leaf = page_is_leaf(page); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { + if (big_rec_vec != NULL) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); + + if (block->page.zip.data && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; + + /* prefetch siblings of the leaf for the pessimistic + operation, if the page is leaf. 
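+ Having the siblings in the buffer pool makes it less likely that the
+ following pessimistic operation, which also latches the left and
+ right neighbours of the leaf, has to wait for synchronous reads.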
*/ + if (leaf) { + btr_cur_prefetch_siblings(block, index); + } +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(err); + } + + ulint max_size = page_get_max_insert_size_after_reorganize(page, 1); + if (max_size < rec_size) { + goto fail; + } + + const ulint n_recs = page_get_n_recs(page); + if (UNIV_UNLIKELY(n_recs >= 8189)) { + ut_ad(srv_page_size == 65536); + goto fail; + } + + if (page_has_garbage(page)) { + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + && n_recs > 1 + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + } + + /* If there have been many consecutive inserts to the + clustered index leaf page of an uncompressed table, check if + we have to split the page to reserve enough free space for + future updates of records. */ + + if (leaf && !block->page.zip.data && dict_index_is_clust(index) + && page_get_n_recs(page) >= 2 + && dict_index_get_space_reserve() + rec_size > max_size + && (btr_page_get_split_rec_to_right(cursor, &dummy) + || btr_page_get_split_rec_to_left(cursor))) { + goto fail; + } + + page_cursor = btr_cur_get_page_cur(cursor); + + DBUG_LOG("ib_cur", + "insert " << index->name << " (" << index->id << ") by " + << ib::hex(thr ? thr->graph->trx->id : 0) + << ' ' << rec_printer(entry).str()); + DBUG_EXECUTE_IF("do_page_reorganize", + ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr) + == DB_SUCCESS);); + + /* Now, try the insert */ + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + + /* Check locks and write to the undo log, + if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + if (err != DB_SUCCESS) { + goto fail_err; + } + +#ifdef UNIV_DEBUG + if (!(flags & BTR_CREATE_FLAG) + && leaf && index->is_primary()) { + const dfield_t* trx_id = dtuple_get_nth_field( + entry, dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, + DATA_TRX_ID), + index)); + + ut_ad(trx_id->len == DATA_TRX_ID_LEN); + ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(*static_cast<const byte*> + (trx_id[1].data) & 0x80); + if (flags & BTR_NO_UNDO_LOG_FLAG) { + ut_ad(!memcmp(trx_id->data, reset_trx_id, + DATA_TRX_ID_LEN)); + } else { + ut_ad(thr->graph->trx->id); + ut_ad(thr->graph->trx->bulk_insert + || thr->graph->trx->id + == trx_read_trx_id( + static_cast<const byte*>( + trx_id->data)) + || index->table->is_temporary()); + } + } +#endif + + *rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap, + n_ext, mtr); + + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + } + + if (*rec) { + } else if (block->page.zip.data) { + ut_ad(!index->table->is_temporary()); + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. 
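+ A reorganize recompresses the page and may change the amount of
+ free space, and the free bits in the bitmap must never exceed the
+ free space that is actually available on the page.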
*/ + if (leaf + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + reorg = true; + + /* If the record did not fit, reorganize */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS + || page_get_max_insert_size(page, 1) != max_size + || !(*rec = page_cur_tuple_insert(page_cursor, entry, + offsets, heap, n_ext, + mtr))) { + err = DB_CORRUPTION; + goto fail_err; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!leaf) { + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + } else if (index->table->is_temporary()) { + } else { + srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf + && !dict_index_is_clust(index) + && !index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (block->page.zip.data) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
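+When the cursor was positioned with BTR_MODIFY_TREE, for example by
+btr_cur_t::open_leaf(), those sibling pages have already been x-latched
+from left to right.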
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index(); + big_rec_t* big_rec_vec = NULL; + bool inherit = false; + uint32_t n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + + *big_rec = NULL; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + return(err); + } + + /* First reserve enough free space for the file segments of + the index tree, so that the insert will not fail because of + lack of space */ + + if (!index->is_ibuf() + && (err = fsp_reserve_free_extents(&n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + + 3), + FSP_NORMAL, mtr)) + != DB_SUCCESS) { + return err; + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + index->table->not_redundant(), + dtuple_get_n_fields(entry), + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. */ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (big_rec_vec == NULL) { + + index->table->space->release_free_extents(n_reserved); + return(DB_TOO_BIG_RECORD); + } + } + + *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no() + ? 
btr_root_raise_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err) + : btr_page_split_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err); + + if (!*rec) { + goto func_exit; + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec + || dict_index_is_spatial(index)); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + ut_ad(!index->table->is_temporary()); + if (dict_index_is_spatial(index)) { + /* Do nothing */ + } else { + /* The cursor might be moved to the other page + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || !page_has_prev(btr_cur_get_page(cursor))) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = true; + } + } + } + + if (!page_is_leaf(btr_cur_get_page(cursor))) { + ut_ad(!big_rec_vec); + } else { +#ifdef BTR_CUR_HASH_ADAPT + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else if (index->table->is_temporary()) { + } else { + btr_search_update_hash_on_insert( + cursor, btr_search_sys.get_latch(*index)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + } + + err = DB_SUCCESS; +func_exit: + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + + return err; +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG)); + + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return((flags & BTR_NO_UNDO_LOG_FLAG) + ? 
DB_SUCCESS + : trx_undo_report_row_operation( + thr, index, NULL, update, + cmpl_info, rec, offsets, roll_ptr)); +} + +/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry. +@param[in,out] entry clustered index entry +@param[in] index clustered index +@param[in] trx_id DB_TRX_ID +@param[in] roll_ptr DB_ROLL_PTR */ +static void btr_cur_write_sys( + dtuple_t* entry, + const dict_index_t* index, + trx_id_t trx_id, + roll_ptr_t roll_ptr) +{ + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx_id); + dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr); +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record. +@param[in,out] block clustered index leaf page +@param[in,out] rec clustered index record +@param[in] index clustered index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx transaction +@param[in] roll_ptr DB_ROLL_PTR value +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec, + dict_index_t *index, const rec_offs *offsets, + const trx_t *trx, roll_ptr_t roll_ptr, + mtr_t *mtr) +{ + ut_ad(index->is_primary()); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(), + trx->id, roll_ptr, mtr); + return DB_SUCCESS; + } + + ulint offset= index->trx_id_offset; + + if (!offset) + offset= row_get_trx_id_offset(index, offsets); + + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + + /* During IMPORT the trx id in the record can be in the future, if + the .ibd file is being imported from another instance. During IMPORT + roll_ptr will be 0. */ + ut_ad(roll_ptr == 0 || + lock_check_trx_id_sanity(trx_read_trx_id(rec + offset), + rec, index, offsets)); + + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + trx_write_trx_id(sys, trx->id); + trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr); + + ulint d= 0; + const byte *src= nullptr; + byte *dest= rec + offset; + ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + if (UNIV_LIKELY(index->trx_id_offset)) + { + const rec_t *prev= page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(!prev || prev == rec)) + return DB_CORRUPTION; + else if (page_rec_is_infimum(prev)); + else + for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++) + if (src[d] != sys[d]) + break; + if (d > 6 && memcmp(dest, sys, d)) + { + /* We save space by replacing a single record + + WRITE,page_offset(dest),byte[13] + + with two records: + + MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes), + WRITE|0x80,0,byte[13-d] + + The single WRITE record would be x+13 bytes long, with x>2. + The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the + second WRITE would be 1+1+13-d = 15-d bytes. + + The total size is: x+13 versus x+4+15-d = x+19-d bytes. + To save space, we must have d>6, that is, the complete DB_TRX_ID and + the first byte(s) of DB_ROLL_PTR must match the previous record. */ + memcpy(dest, src, d); + mtr->memmove(*block, page_offset(dest), page_offset(src), d); + dest+= d; + len-= d; + /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when + DB_TRX_ID refers to an active transaction. 
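+ Hence d cannot cover all of DB_TRX_ID and DB_ROLL_PTR, and len stays
+ positive below. (As an example of the size calculation above: with
+ x = 3 and d = 8, a single WRITE would be 16 bytes, while the MEMMOVE
+ plus the remaining WRITE take at most 7 + 7 = 14 bytes.)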
*/ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len); + + return DB_SUCCESS; +} + +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + + /* Have a local copy of the variables as these can change + dynamically. */ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) { + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return true; + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). 
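+A typical update handled here changes only fixed-length fields such as
+DB_TRX_ID and DB_ROLL_PTR, or merely the delete-mark and other info
+bits, so that the layout of the record does not change.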
+@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(!block->page.zip.data || index->table->not_redundant()); + +#ifdef UNIV_DEBUG + if (rec_offs_comp(offsets)) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong record status in update" == 0); + } + } +#endif /* UNIV_DEBUG */ + + static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility"); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(rec_offs_comp(offsets)); + byte* info_bits = &rec[-REC_NEW_INFO_BITS]; + const bool flip_del_mark = (*info_bits ^ update->info_bits) + & REC_INFO_DELETED_FLAG; + *info_bits &= byte(~REC_INFO_BITS_MASK); + *info_bits |= update->info_bits; + + if (flip_del_mark) { + page_zip_rec_set_deleted(block, rec, update->info_bits + & REC_INFO_DELETED_FLAG, mtr); + } + } else { + byte* info_bits = &rec[rec_offs_comp(offsets) + ? -REC_NEW_INFO_BITS + : -REC_OLD_INFO_BITS]; + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, + (*info_bits + & ~REC_INFO_BITS_MASK) + | update->info_bits); + } + + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) { + continue; + } + const ulint n = uf->field_no; + + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + + ut_ad(!index->table->not_redundant()); + switch (ulint size = rec_get_nth_field_size(rec, n)) { + case 0: + break; + case 1: + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, + rec_get_field_start_offs(rec, n) + rec, + 0U); + break; + default: + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b | REC_1BYTE_SQL_NULL_MASK)); + continue; + } + + ulint len; + byte* data = rec_get_nth_field(rec, offsets, n, &len); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(len == uf->new_val.len); + memcpy(data, uf->new_val.data, len); + continue; + } + + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + len = uf->new_val.len; + ut_ad(len == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(rec) + ? 
(n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); + } + + if (len) { + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data, + uf->new_val.data, len); + } + } + + if (UNIV_LIKELY(!block->page.zip.data)) { + return; + } + + switch (update->n_fields) { + case 0: + /* We only changed the delete-mark flag. */ + return; + case 1: + if (!index->is_clust() + || update->fields[0].field_no != index->db_roll_ptr()) { + break; + } + goto update_sys; + case 2: + if (!index->is_clust() + || update->fields[0].field_no != index->db_trx_id() + || update->fields[1].field_no != index->db_roll_ptr()) { + break; + } + update_sys: + ulint len; + const byte* sys = rec_get_nth_field(rec, offsets, + index->db_trx_id(), &len); + ut_ad(len == DATA_TRX_ID_LEN); + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, index->db_trx_id(), + trx_read_trx_id(sys), + trx_read_roll_ptr(sys + DATA_TRX_ID_LEN), mtr); + return; + } + + page_zip_write_rec(block, rec, index, offsets, 0, mtr); +} + +/** Check if a ROW_FORMAT=COMPRESSED page can be updated in place +@param cur cursor pointing to ROW_FORMAT=COMPRESSED page +@param offsets rec_get_offsets(btr_cur_get_rec(cur)) +@param update index fields being updated +@param mtr mini-transaction +@return the record in the ROW_FORMAT=COMPRESSED page +@retval nullptr if the page cannot be updated in place */ +ATTRIBUTE_COLD static +rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets, + const upd_t& update, mtr_t *mtr) +{ + dict_index_t *index= cur->index(); + ut_ad(!index->table->is_temporary()); + + switch (update.n_fields) { + case 0: + /* We are only changing the delete-mark flag. */ + break; + case 1: + if (!index->is_clust() || + update.fields[0].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing the delete-mark flag and DB_ROLL_PTR. */ + break; + case 2: + if (!index->is_clust() || + update.fields[0].field_no != index->db_trx_id() || + update.fields[1].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing DB_TRX_ID, DB_ROLL_PTR, and the delete-mark. + They can be updated in place in the uncompressed part of the + ROW_FORMAT=COMPRESSED page. */ + break; + check_for_overflow: + default: + if (!btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur), + btr_cur_get_page_cur(cur), + offsets, rec_offs_size(offsets), + false, mtr)) + return nullptr; + } + + return btr_cur_get_rec(cur); +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + + ut_ad(page_is_leaf(cursor->page_cur.block->page.frame)); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(!index->is_ibuf()); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); + + DBUG_LOG("ib_cur", + "update-in-place " << index->name << " (" << index->id + << ") by " << ib::hex(trx_id) << ": " + << rec_printer(rec, offsets).str()); + + buf_block_t* block = btr_cur_get_block(cursor); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. */ + if (UNIV_LIKELY_NULL(page_zip) + && !(rec = btr_cur_update_in_place_zip_check( + cursor, offsets, *update, mtr))) { + return DB_ZIP_OVERFLOW; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + err = btr_cur_upd_rec_sys(block, rec, index, offsets, + thr_get_trx(thr), roll_ptr, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + /* In delete-marked records, DB_TRX_ID must always refer to an + existing undo log record. */ + ut_ad(!was_delete_marked + || !dict_index_is_clust(index) + || row_get_rec_trx_id(rec, index, offsets)); + +#ifdef BTR_CUR_HASH_ADAPT + { + srw_spin_lock* ahi_latch = block->index + ? btr_search_sys.get_latch(*index) : NULL; + if (ahi_latch) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? 
*/ + + /* The function row_upd_changes_ord_field_binary + does not work on a secondary index. */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary( + index, update, thr, NULL, NULL)) { + ut_ad(!(update->info_bits + & REC_INFO_MIN_REC_FLAG)); + /* Remove possible hash index pointer + to this record */ + btr_search_update_hash_on_delete(cursor); + } + + ahi_latch->wr_lock(SRW_LOCK_CALL); + } + + assert_block_ahi_valid(block); +#endif /* BTR_CUR_HASH_ADAPT */ + + btr_cur_upd_rec_in_place(rec, index, offsets, update, block, + mtr); + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + ahi_latch->wr_unlock(); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (was_delete_marked + && !rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr); + } + + ut_ad(err == DB_SUCCESS); + +func_exit: + if (page_zip + && !(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + return(err); +} + +/** Trim a metadata record during the rollback of instant ALTER TABLE. +@param[in] entry metadata tuple +@param[in] index primary key +@param[in] update update vector for the rollback */ +ATTRIBUTE_COLD +static void btr_cur_trim_alter_metadata(dtuple_t* entry, + const dict_index_t* index, + const upd_t* update) +{ + ut_ad(index->is_instant()); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->is_alter_metadata()); + + ut_ad(update->fields[0].field_no == index->first_user_field()); + ut_ad(update->fields[0].new_val.ext); + ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE); + ut_ad(entry->n_fields - 1 == index->n_fields); + + const byte* ptr = static_cast<const byte*>( + update->fields[0].new_val.data); + ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN)); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + == index->table->space->id); + + ulint n_fields = update->fields[1].field_no; + ut_ad(n_fields <= index->n_fields); + if (n_fields != index->n_uniq) { + ut_ad(n_fields + >= index->n_core_fields); + entry->n_fields = n_fields; + return; + } + + /* This is based on dict_table_t::deserialise_columns() + and btr_cur_instant_init_low(). */ + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, &mtr); + if (!block) { + ut_ad("corruption" == 0); + mtr.commit(); + return; + } + ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB); + ut_ad(mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO]) + == FIL_NULL); + ut_ad(mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN]) + == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4)); + n_fields = mach_read_from_4( + &block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE]) + + index->first_user_field(); + /* Rollback should not increase the number of fields. */ + ut_ad(n_fields <= index->n_fields); + ut_ad(n_fields + 1 <= entry->n_fields); + /* dict_index_t::clear_instant_alter() cannot be invoked while + rollback of an instant ALTER TABLE transaction is in progress + for an is_alter_metadata() record. 
*/ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { + /* We are either updating a metadata record + (instant ALTER TABLE on a table where instant ALTER was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(entry->is_metadata()); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. + The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
+@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + ulint max_ins_size = 0; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(index->has_locking()); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page updates */ + ut_ad(page_is_leaf(page)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (UNIV_LIKELY(!update->is_metadata()) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + ut_ad(!index->is_ibuf()); + /* Externally stored fields are treated in pessimistic + update */ + + /* prefetch siblings of the leaf for the pessimistic + operation. 
*/ + btr_cur_prefetch_siblings(block, index); + + return(DB_OVERFLOW); + } + + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + DBUG_LOG("ib_cur", + "update " << index->name << " (" << index->id << ") by " + << ib::hex(trx_id) << ": " + << rec_printer(rec, *offsets).str()); + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap); + ut_ad(!dtuple_get_n_ext(new_entry)); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + *heap); + btr_cur_trim(new_entry, index, update, thr); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + + if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size())) { + goto any_extern; + } + + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, *offsets, + new_rec_size, true, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + /* We limit max record size to 16k even for 64k page size. */ + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize( + page, 1); + } + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). 
*/ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + if (index->has_locking()) { + lock_rec_store_on_page_infimum(block, rec); + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + return DB_CORRUPTION; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap, + 0/*n_ext*/, mtr); + if (UNIV_UNLIKELY(!rec)) { + goto corrupted; + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + } else { + /* Restore the old explicit lock state on the record */ + lock_rec_restore_from_page_infimum(*block, rec, + block->page.id()); + } + + ut_ad(err == DB_SUCCESS); + if (!page_cur_move_to_next(page_cursor)) { +corrupted: + err = DB_CORRUPTION; + } + +func_exit: + if (!(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index)) { + /* Update the free bits in the insert buffer. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + } + + if (err != DB_SUCCESS) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
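+For example, if the split made the updated record the first user record
+of its page, the supremum of the preceding page is now its immediate
+predecessor and must carry the gap locks of the updated record.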
*/ +static +dberr_t +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + return DB_SUCCESS; + } + + const uint32_t prev_page_no = btr_page_get_prev(page); + + const page_id_t block_id{block->page.id()}; + const page_id_t prev_id(block_id.space(), prev_page_no); + dberr_t err; + buf_block_t* prev_block + = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr, + BUF_PEEK_IF_IN_POOL, mtr, &err); + /* Since we already held an x-latch on prev_block, it must + be available and not be corrupted unless the buffer pool got + corrupted somehow. */ + if (UNIV_UNLIKELY(!prev_block)) { + return err; + } + ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, 4)); + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); + return DB_SUCCESS; +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. 
*/ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be + committed before latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dberr_t err; + dberr_t optim_err; + roll_ptr_t roll_ptr; + bool was_first; + uint32_t n_reserved = 0; + + *offsets = NULL; + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + index = cursor->index(); + ut_ad(index->has_locking()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(!page_zip || !index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~BTR_KEEP_POS_FLAG)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + + err = optim_err = btr_cur_optimistic_update( + flags | BTR_KEEP_IBUF_BITMAP, + cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); + + switch (err) { + case DB_ZIP_OVERFLOW: + case DB_UNDERFLOW: + case DB_OVERFLOW: + break; + default: + err_exit: + /* We suppressed this with BTR_KEEP_IBUF_BITMAP. + For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were + already reset by btr_cur_update_alloc_zip() if the + page was recompressed. */ + if (page_zip + && optim_err != DB_ZIP_OVERFLOW + && !dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + if (big_rec_vec != NULL) { + dtuple_big_rec_free(big_rec_vec); + } + + return(err); + } + + rec = btr_cur_get_rec(cursor); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. 
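+
+	For orientation, a simplified outline of the rest of this function;
+	error handling, change-buffer bookkeeping and ROW_FORMAT=COMPRESSED
+	details are omitted, and all names are the ones used below:
+
+	  row_upd_index_replace_new_col_vals_index_pos(new_entry, ...);
+	  btr_cur_trim(new_entry, index, update, thr);
+	  // on rollback, free extern fields that got new values
+	  // convert the longest fields to big_rec if the record will not fit
+	  btr_cur_upd_lock_and_undo(...);          // locking and undo logging
+	  fsp_reserve_free_extents(...);           // only after DB_OVERFLOW
+	  page_cur_delete_rec(...);                // remove the old version
+	  rec = btr_cur_insert_if_possible(...);   // reinsert on the same page
+	  if (!rec)
+	    btr_cur_pessimistic_insert(...);       // otherwise split and insert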
*/ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + if ((flags & BTR_NO_UNDO_LOG_FLAG) + && rec_offs_any_extern(*offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->in_rollback); + + DEBUG_SYNC_C("blob_rollback_middle"); + + btr_rec_free_updated_extern_fields( + index, rec, block, *offsets, update, true, mtr); + } + + ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0; + + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(block->page.frame), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { + big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + index->table->space->release_free_extents(n_reserved); + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) { + ut_ad(page_zip != NULL); + dtuple_convert_back_big_rec(index, new_entry, + big_rec_vec); + big_rec_vec = NULL; + n_ext = dtuple_get_n_ext(new_entry); + } + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (optim_err == DB_OVERFLOW) { + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + err = fsp_reserve_free_extents( + &n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + 3), + flags & BTR_NO_UNDO_LOG_FLAG + ? FSP_CLEANING : FSP_NORMAL, + mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + const ulint max_ins_size = page_zip + ? 0 + : page_get_max_insert_size_after_reorganize(block->page.frame, + 1); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ALTER TABLE, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. 
The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + lock_rec_store_on_page_infimum(block, rec); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + err = DB_CORRUPTION; + goto return_after_reservations; + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + if (page_cursor->block->page.id().page_no() + == index->page) { + btr_set_instant(page_cursor->block, *index, + mtr); + } + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, + block->page.id()); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), + rec, index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(block->page.frame)); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); + } + } else if (!dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } + } + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (!big_rec_vec + && page_is_leaf(block->page.frame) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. 
*/ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(block->page.frame)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec != NULL) { + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr->release(index->lock). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + mtr_sx_lock_index(index, mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(err == DB_SUCCESS); + ut_a(rec); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + page_cursor->rec = rec; + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + page_update_max_trx_id(btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec, + index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, block->page.id()); + } + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. 
*/ + + if (!was_first) { + err = btr_cur_pess_upd_restore_supremum( + btr_cur_get_block(cursor), rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(err || + !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor), + btr_cur_get_page(cursor), index)); +#endif /* UNIV_ZIP_DEBUG */ + + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/** Modify the delete-mark flag of a record. +@tparam flag the value of the delete-mark flag +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in,out] mtr mini-transaction */ +template<bool flag> +void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr) +{ + if (page_rec_is_comp(rec)) + { + byte *b= &rec[-REC_NEW_INFO_BITS]; + const byte v= flag + ? (*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + if (*b == v); + else if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + *b= v; + page_zip_rec_set_deleted(block, rec, flag, mtr); + } + else + mtr->write<1>(*block, b, v); + } + else + { + ut_ad(!block->page.zip.data); + byte *b= &rec[-REC_OLD_INFO_BITS]; + const byte v = flag + ? (*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v); + } +} + +template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *); +template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *); + +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record, also + contains the virtual cols if there are any */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + roll_ptr_t roll_ptr; + dberr_t err; + trx_t* trx; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(buf_block_get_frame(block) == page_align(rec)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* We may already have delete-marked this record + when executing an ON DELETE CASCADE operation. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets) + == thr_get_trx(thr)->id); + return(DB_SUCCESS); + } + + err = trx_undo_report_row_operation(thr, index, + entry, NULL, 0, rec, offsets, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* The search latch is not needed here, because + the adaptive hash index does not depend on the delete-mark + and the delete-mark is being updated in place. 
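+
+	For illustration: the delete-mark is a single bit in the record's
+	info bits, which are stored just before the record origin. Ignoring
+	redo logging and the ROW_FORMAT=COMPRESSED path, the call to
+	btr_rec_set_deleted() below is essentially:
+
+	  byte* info = rec - (page_rec_is_comp(rec)
+	                      ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS);
+	  *info = flag ? byte(*info | REC_INFO_DELETED_FLAG)
+	               : byte(*info & ~REC_INFO_DELETED_FLAG);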
*/ + + btr_rec_set_deleted<true>(block, rec, mtr); + + trx = thr_get_trx(thr); + + DBUG_LOG("ib_cur", + "delete-mark clust " << index->table->name + << " (" << index->id << ") by " + << ib::hex(trx->id) << ": " + << rec_printer(rec, offsets).str()); + + return btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, + mtr); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return whether compression occurred */ +bool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + if (cursor->index()->is_spatial()) { + const trx_t* trx = cursor->rtr_info->thr + ? thr_get_trx(cursor->rtr_info->thr) + : NULL; + const buf_block_t* block = btr_cur_get_block(cursor); + + /* Check whether page lock prevents the compression */ + if (!lock_test_prdt_page_lock(trx, block->page.id())) { + return(false); + } + } + + return btr_cur_compress_recommendation(cursor, mtr) + && btr_compress(cursor, adjust, mtr) == DB_SUCCESS; +} + +/*******************************************************//** +Removes the record on which the tree cursor is positioned on a leaf page. +It is assumed that the mtr has an x-latch on the page where the cursor is +positioned, but no latch on the whole tree. 
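+
+A typical caller first tries this optimistic path and falls back to
+btr_cur_pessimistic_delete() when DB_FAIL is returned. Roughly, with latch
+and mini-transaction management omitted (so this is only a sketch):
+
+  dberr_t err = btr_cur_optimistic_delete(&cursor, 0, &mtr);
+  if (err == DB_FAIL) {
+    // the pessimistic variant reserves cursor->tree_height / 32 + 1 free
+    // extents itself unless told that the caller has already done so
+    btr_cur_pessimistic_delete(&err, FALSE, &cursor, 0, false, &mtr);
+  }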
+@return error code +@retval DB_FAIL if the page would become too empty */ +dberr_t +btr_cur_optimistic_delete( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(cursor->index()->table->space)); + ut_ad(!cursor->index()->is_dummy); + + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.id().space() == cursor->index()->table->space->id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || cursor->index()->is_clust() + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + ULINT_UNDEFINED, &heap); + + dberr_t err = DB_SUCCESS; + if (rec_offs_any_extern(offsets) + || !btr_cur_can_delete_without_compress(cursor, + rec_offs_size(offsets), + mtr)) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, cursor->index()); + err = DB_FAIL; + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page + && page_get_n_recs(block->page.frame) == 1 + + (cursor->index()->is_instant() + && !rec_is_metadata(rec, *cursor->index())) + && !cursor->index() + ->must_avoid_clear_instant_add())) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN (not generic ALTER TABLE). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index(); + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->page.frame)); + if (UNIV_UNLIKELY(!first_rec)) { + err = DB_CORRUPTION; + goto func_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata && !flags)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + goto func_exit; + } + } + + { + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index()->table->supports_instant()); + ut_ad(cursor->index()->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. */ + err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + goto func_exit; + } else { + if (!flags) { + lock_update_delete(block, rec); + } + + btr_search_update_hash_on_delete(cursor); + } + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. */ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (!cursor->index()->is_clust() + && !cursor->index()->table->is_temporary() + && !dict_index_is_ibuf(cursor->index())) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred and FALSE if not or something +wrong. */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? 
*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + uint32_t n_reserved = 0; + ibool ret = FALSE; + mem_heap_t* heap; + rec_offs* offsets; +#ifdef UNIV_DEBUG + bool parent_latched = false; +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1); + + *err = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, + FSP_CLEANING, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, block, + rollback, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + rec_t* next_rec = NULL; + bool min_mark_next_rec = false; + + if (page_is_leaf(page)) { + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ALTER TABLE. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { + lock_update_delete(block, rec); + } + + if (block->page.id().page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() && !is_metadata) + && !index->must_avoid_clear_instant_add()) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the + table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + if (UNIV_UNLIKELY(!first_rec)) { + *err = DB_CORRUPTION; + goto err_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; + } + } + + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + *err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + ut_ad(!ret); + goto err_exit; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + goto return_after_reservations; + } + + if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) { + ut_ad(!ret); + *err = DB_CORRUPTION; + goto err_exit; + } + + btr_cur_t cursor; + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (!page_has_prev(page)) { + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + min_mark_next_rec = true; + } else if (index->is_spatial()) { + /* For rtree, if delete the leftmost node pointer, + we need to update parent page. */ + rtr_mbr_t father_mbr; + rec_t* father_rec; + rec_offs* offsets; + ulint len; + + rtr_page_get_father_block(NULL, heap, mtr, NULL, + &cursor); + father_rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(father_rec, index, NULL, + 0, ULINT_UNDEFINED, &heap); + + rtr_read_mbr(rec_get_nth_field( + father_rec, offsets, 0, &len), &father_mbr); + + rtr_update_mbr_field(&cursor, offsets, NULL, + page, &father_mbr, next_rec, mtr); + ut_d(parent_latched = true); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the parent node pointer + so that it is equal to the new leftmost node pointer + on the page */ + ret = btr_page_get_father(mtr, &cursor); + if (!ret) { + *err = DB_CORRUPTION; + goto err_exit; + } + *err = btr_cur_node_ptr_delete(&cursor, mtr); + if (*err != DB_SUCCESS) { +got_err: + ret = FALSE; + goto err_exit; + } + + const ulint level = btr_page_get_level(page); + // FIXME: reuse the node_ptr from above + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, block->page.id().page_no(), + heap, level); + + *err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + if (*err != DB_SUCCESS) { + ret = FALSE; + goto got_err; + } + + ut_d(parent_latched = true); + } + } + + /* SPATIAL INDEX never use U locks; we can allow page merges + while holding X lock on the spatial index tree. + Do not allow merges of non-leaf B-tree pages unless it is + safe to do so. 
*/ + { + const bool allow_merge = page_is_leaf(page) + || dict_index_is_spatial(index) + || btr_cur_will_modify_tree( + index, page, BTR_INTENTION_DELETE, rec, + btr_node_ptr_max_size(index), + block->zip_size(), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + if (min_mark_next_rec) { + btr_set_min_rec_mark(next_rec, *block, mtr); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(!parent_latched + || btr_check_node_ptr(index, block, mtr)); + + if (!ret && btr_cur_compress_recommendation(cursor, mtr)) { + if (UNIV_LIKELY(allow_merge)) { + ret = btr_cur_compress_if_useful( + cursor, FALSE, mtr); + } else { + ib::warn() << "Not merging page " + << block->page.id() + << " in index " << index->name + << " of " << index->table->name; + ut_ad("MDEV-14637" == 0); + } + } + } + +return_after_reservations: + *err = DB_SUCCESS; +err_exit: + mem_heap_free(heap); + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (page_is_leaf(page) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + index->table->space->release_free_extents(n_reserved); + return(ret); +} + +/** Delete the node pointer in a parent page. +@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent), + MTR_MEMO_PAGE_X_FIX)); + dberr_t err; + ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent, + BTR_CREATE_FLAG, false, + mtr); + if (err == DB_SUCCESS && !compressed) { + btr_cur_compress_if_useful(parent, FALSE, mtr); + } + + return err; +} + +/** Represents the cursor for the number of rows estimation. The +content is used for level-by-level diving and estimation the number of rows +on each level. */ +class btr_est_cur_t +{ + /* Assume a page like: + records: (inf, a, b, c, d, sup) + index of the record: 0, 1, 2, 3, 4, 5 + */ + + /** Index of the record where the page cursor stopped on this level + (index in alphabetical order). In the above example, if the search stopped on + record 'c', then nth_rec will be 3. */ + ulint m_nth_rec; + + /** Number of the records on the page, not counting inf and sup. + In the above example n_recs will be 4. 
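+  With the cursor stopped on 'c' (m_nth_rec == 3, m_n_recs == 4), the number
+  of records strictly to the right of the border on this page is
+  m_n_recs - m_nth_rec == 1 (only 'd'), and strictly to the left of it
+  m_nth_rec - 1 == 2 ('a' and 'b'); these are exactly the quantities that
+  the range estimation below adds for the left and right border pages.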
*/ + ulint m_n_recs; + + /** Search tuple */ + const dtuple_t &m_tuple; + /** Cursor search mode */ + page_cur_mode_t m_mode; + /** Page cursor which is used for search */ + page_cur_t m_page_cur; + /** Page id of the page to get on level down, can differ from + m_block->page.id at the moment when the child's page id is already found, but + the child's block has not fetched yet */ + page_id_t m_page_id; + /** Current block */ + buf_block_t *m_block; + /** Page search mode, can differ from m_mode for non-leaf pages, see c-tor + comments for details */ + page_cur_mode_t m_page_mode; + + /** Matched fields and bytes which are used for on-page search, see + btr_cur_t::(up|low)_(match|bytes) comments for details */ + ulint m_up_match= 0; + ulint m_up_bytes= 0; + ulint m_low_match= 0; + ulint m_low_bytes= 0; + +public: + btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple, + page_cur_mode_t mode) + : m_tuple(tuple), m_mode(mode), + m_page_id(index->table->space_id, index->page), m_block(nullptr) + { + + ut_ad(dict_index_check_search_tuple(index, &tuple)); + ut_ad(dtuple_check_typed(&tuple)); + + m_page_cur.index = index; + /* We use these modified search modes on non-leaf levels of the B-tree. + These let us end up in the right B-tree leaf. In that leaf we use the + original search mode. */ + switch (mode) { + case PAGE_CUR_GE: + m_page_mode= PAGE_CUR_L; + break; + case PAGE_CUR_G: + m_page_mode= PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE || + mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + m_page_mode= mode; + break; + } + } + + /** Retrieve block with m_page_id, release the previously gotten block + if necessary. If this is a left border block cursor and both left and right + border blocks have the same parent, don't unlatch the parent, as it must be + latched to get the right block, and will be unlatched after the right block + is fetched. + @param level distance from the leaf page level; ULINT_UNDEFINED when + fetching the root page + @param mtr mtr + @param right_parent right border block parent, nullptr if the function + is called for the right block itself + @return true on success or false otherwise. */ + bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent) + { + buf_block_t *parent_block= m_block; + + m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level, + &mtr, nullptr); + if (!m_block) + return false; + + if (parent_block && parent_block != right_parent) + { + ut_ad(mtr.get_savepoint() >= 2); + mtr.rollback_to_savepoint(1, 2); + } + + return level == ULINT_UNDEFINED || + btr_page_get_level(m_block->page.frame) == level; + } + + /** Sets page mode for leaves */ + void set_page_mode_for_leaves() { m_page_mode= m_mode; } + + /** Does search on the current page. If there is no border in m_tuple, then + just move the cursor to the most left or right record. + @param level current level on tree. + @param root_height root height + @param left true if this is left border, false otherwise. + @return true on success, false otherwise. 
*/ + bool search_on_page(ulint level, ulint root_height, bool left) + { + if (level != btr_page_get_level(m_block->page.frame)) + return false; + + m_n_recs= page_get_n_recs(m_block->page.frame); + + if (dtuple_get_n_fields(&m_tuple) > 0) + { + m_up_bytes= m_low_bytes= 0; + m_page_cur.block= m_block; + if (page_cur_search_with_match(&m_tuple, m_page_mode, + &m_up_match, &m_low_match, &m_page_cur, + nullptr)) + return false; + m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur)); + } + else if (left) + { + page_cur_set_before_first(m_block, &m_page_cur); + if (level) + { + if (!page_cur_move_to_next(&m_page_cur)) + return false; + m_nth_rec= 1; + } + else + m_nth_rec= 0; + } + else + { + m_nth_rec= m_n_recs; + if (!level) + { + page_cur_set_after_last(m_block, &m_page_cur); + ++m_nth_rec; + } + else + { + m_page_cur.block= m_block; + m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec); + } + } + + return true; + } + + /** Read page id of the current record child. + @param offsets offsets array. + @param heap heap for offsets array */ + void read_child_page_id(rec_offs **offsets, mem_heap_t **heap) + { + const rec_t *node_ptr= page_cur_get_rec(&m_page_cur); + + /* FIXME: get the child page number directly without computing offsets */ + *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED, + heap); + + /* Go to the child node */ + m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets)); + } + + /** @return true if left border should be counted */ + bool should_count_the_left_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur)); + } + ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return false; + } + + /** @return true if right border should be counted */ + bool should_count_the_right_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + const rec_t *rec= page_cur_get_rec(&m_page_cur); + ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec))); + + return (m_mode == PAGE_CUR_LE /* if the range is '<=' */ + /* and the record was found */ + && m_low_match >= dtuple_get_n_fields(&m_tuple)) || + (m_mode == PAGE_CUR_L /* or if the range is '<' */ + /* and there are any records to match the criteria, i.e. if the + minimum record on the tree is 5 and x < 7 is specified then the + cursor will be positioned at 5 and we should count the border, + but if x < 2 is specified, then the cursor will be positioned at + 'inf' and we should not count the border */ + && !page_rec_is_infimum(rec)); + /* Notice that for "WHERE col <= 'foo'" the server passes to + ha_innobase::records_in_range(): min_key=NULL (left-unbounded) which is + expected max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is + unexpected - one would expect flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In + this case the cursor will be positioned on the first record to the right + of the requested one (can also be positioned on the 'sup') and we should + not count the right border. */ + } + ut_ad(page_rec_is_supremum(page_cur_get_rec(&m_page_cur))); + + /* The range specified is without a right border, just 'x > 123' + or 'x >= 123' and search_on_page() positioned the cursor on the + supremum record on the rightmost page, which must not be counted. 
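+
+    Concrete examples for a leaf level containing the keys (5, 6, 7):
+      mode PAGE_CUR_LE, tuple (6), 6 found -> low_match covers the tuple,
+                                              count the border
+      mode PAGE_CUR_L,  tuple (6)          -> cursor on 5, count the border
+      mode PAGE_CUR_L,  tuple (2)          -> cursor on the infimum,
+                                              do not count
+      empty tuple (no right bound)         -> cursor after the last record,
+                                              do not count.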
*/ + return false; + } + + /** @return index */ + const dict_index_t *index() const { return m_page_cur.index; } + + /** @return current block */ + const buf_block_t *block() const { return m_block; } + + /** @return current page id */ + page_id_t page_id() const { return m_page_id; } + + /** Copies block pointer and savepoint from another btr_est_cur_t in the case + if both left and right border cursors point to the same block. + @param o reference to the other btr_est_cur_t object. */ + void set_block(const btr_est_cur_t &o) { m_block= o.m_block; } + + /** @return current record number. */ + ulint nth_rec() const { return m_nth_rec; } + + /** @return number of records in the current page. */ + ulint n_recs() const { return m_n_recs; } +}; + +/** Estimate the number of rows between the left record of the path and the +right one(non-inclusive) for the certain level on a B-tree. This function +starts from the page next to the left page and reads a few pages to the right, +counting their records. If we reach the right page quickly then we know exactly +how many records there are between left and right records and we set +is_n_rows_exact to true. After some page is latched, the previous page is +unlatched. If we cannot reach the right page quickly then we calculate the +average number of records in the pages scanned so far and assume that all pages +that we did not scan up to the right page contain the same number of records, +then we multiply that average to the number of pages between right and left +records (which is n_rows_on_prev_level). In this case we set is_n_rows_exact to +false. +@param level current level. +@param left_cur the cursor of the left page. +@param right_page_no right page number. +@param n_rows_on_prev_level number of rows on the previous level. +@param[out] is_n_rows_exact true if exact rows number is returned. +@param[in,out] mtr mtr, +@return number of rows, not including the borders (exact or estimated). */ +static ha_rows btr_estimate_n_rows_in_range_on_level( + ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no, + ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr) +{ + ha_rows n_rows= 0; + uint n_pages_read= 0; + /* Do not read more than this number of pages in order not to hurt + performance with this code which is just an estimation. If we read this many + pages before reaching right_page_no, then we estimate the average from the + pages scanned so far. */ + static constexpr uint n_pages_read_limit= 9; + buf_block_t *block= nullptr; + const dict_index_t *index= left_cur.index(); + + /* Assume by default that we will scan all pages between left and right(non + inclusive) pages */ + is_n_rows_exact= true; + + /* Add records from the left page which are to the right of the record which + serves as a left border of the range, if any (we don't include the record + itself in this count). */ + if (left_cur.nth_rec() <= left_cur.n_recs()) + { + n_rows+= left_cur.n_recs() - left_cur.nth_rec(); + } + + /* Count the records in the pages between left and right (non inclusive) + pages */ + + const fil_space_t *space= index->table->space; + page_id_t page_id(space->id, + btr_page_get_next(buf_block_get_frame(left_cur.block()))); + + if (page_id.page_no() == FIL_NULL) + goto inexact; + + do + { + page_t *page; + buf_block_t *prev_block= block; + + /* Fetch the page. 
*/ + block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr, + nullptr); + + if (prev_block) + { + ulint savepoint = mtr.get_savepoint(); + /* Index s-lock, p1, p2 latches, can also be p1 and p2 parent latch if + they are not diverged */ + ut_ad(savepoint >= 3); + mtr.rollback_to_savepoint(savepoint - 2, savepoint - 1); + } + + if (!block || btr_page_get_level(buf_block_get_frame(block)) != level) + goto inexact; + + page= buf_block_get_frame(block); + + /* It is possible but highly unlikely that the page was originally written + by an old version of InnoDB that did not initialize FIL_PAGE_TYPE on other + than B-tree pages. For example, this could be an almost-empty BLOB page + that happens to contain the magic values in the fields + that we checked above. */ + + n_pages_read++; + + n_rows+= page_get_n_recs(page); + + page_id.set_page_no(btr_page_get_next(page)); + + if (n_pages_read == n_pages_read_limit) + { + /* We read too many pages or we reached the end of the level + without passing through right_page_no. */ + goto inexact; + } + + } while (page_id.page_no() != right_page_no); + + if (block) + { + ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1)); + mtr.rollback_to_savepoint(mtr.get_savepoint() - 1); + } + + return (n_rows); + +inexact: + + if (block) + { + ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1)); + mtr.rollback_to_savepoint(mtr.get_savepoint() - 1); + } + + is_n_rows_exact= false; + + /* We did interrupt before reaching right page */ + + if (n_pages_read > 0) + { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows= n_rows_on_prev_level * n_rows / n_pages_read; + } + else + { + n_rows= 10; + } + + return (n_rows); +} + +/** Estimates the number of rows in a given index range. Do search in the left +page, then if there are pages between left and right ones, read a few pages to +the right, if the right page is reached, count the exact number of rows without +fetching the right page, the right page will be fetched in the caller of this +function and the amount of its rows will be added. If the right page is not +reached, count the estimated(see btr_estimate_n_rows_in_range_on_level() for +details) rows number, and fetch the right page. If leaves are reached, unlatch +non-leaf pages except the right leaf parent. After the right leaf page is +fetched, commit mtr. +@param[in] index index +@param[in] range_start range start +@param[in] range_end range end +@return estimated number of rows; */ +ha_rows btr_estimate_n_rows_in_range(dict_index_t *index, + btr_pos_t *range_start, + btr_pos_t *range_end) +{ + DBUG_ENTER("btr_estimate_n_rows_in_range"); + + if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted())) + DBUG_RETURN(0); + + ut_ad(index->is_btree()); + + btr_est_cur_t p1(index, *range_start->tuple, range_start->mode); + btr_est_cur_t p2(index, *range_end->tuple, range_end->mode); + mtr_t mtr; + + ulint height; + ulint root_height= 0; /* remove warning */ + + mem_heap_t *heap= NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + + mtr.start(); + + ut_ad(mtr.get_savepoint() == 0); + mtr_s_lock_index(index, &mtr); + + ha_rows table_n_rows= dict_table_get_n_rows(index->table); + + height= ULINT_UNDEFINED; + + /* This becomes true when the two paths do not pass through the same pages + anymore. */ + bool diverged= false; + /* This is the height, i.e. 
the number of levels from the root, where paths + are not the same or adjacent any more. */ + ulint divergence_height= ULINT_UNDEFINED; + bool should_count_the_left_border= true; + bool should_count_the_right_border= true; + bool is_n_rows_exact= true; + ha_rows n_rows= 0; + + /* Loop and search until we arrive at the desired level. */ +search_loop: + if (!p1.fetch_child(height, mtr, p2.block())) + goto error; + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= btr_page_get_level(buf_block_get_frame(p1.block())); + root_height= height; + } + + if (!height) + { + p1.set_page_mode_for_leaves(); + p2.set_page_mode_for_leaves(); + } + + if (p1.page_id() == p2.page_id()) + p2.set_block(p1); + else + { + ut_ad(diverged); + if (divergence_height != ULINT_UNDEFINED) { + /* We need to call p1.search_on_page() here as + btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and + p1.m_nth_rec. */ + if (!p1.search_on_page(height, root_height, true)) + goto error; + n_rows= btr_estimate_n_rows_in_range_on_level( + height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr); + } + if (!p2.fetch_child(height, mtr, nullptr)) + goto error; + } + + if (height == 0) + /* There is no need to release non-leaf pages here as they must already be + unlatched in btr_est_cur_t::fetch_child(). Try to search on pages after + releasing the index latch, to decrease contention. */ + mtr.rollback_to_savepoint(0, 1); + + /* There is no need to search on left page if + divergence_height != ULINT_UNDEFINED, as it was already searched before + btr_estimate_n_rows_in_range_on_level() call */ + if (divergence_height == ULINT_UNDEFINED && + !p1.search_on_page(height, root_height, true)) + goto error; + + if (!p2.search_on_page(height, root_height, false)) + goto error; + + if (!diverged && (p1.nth_rec() != p2.nth_rec())) + { + ut_ad(p1.page_id() == p2.page_id()); + diverged= true; + if (p1.nth_rec() < p2.nth_rec()) + { + /* We do not count the borders (nor the left nor the right one), thus + "- 1". */ + n_rows= p2.nth_rec() - p1.nth_rec() - 1; + + if (n_rows > 0) + { + /* There is at least one row between the two borders pointed to by p1 + and p2, so on the level below the slots will point to non-adjacent + pages. */ + divergence_height= root_height - height; + } + } + else + { + /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we have + a single page tree which contains (inf, 5, 6, supr) and we select where x + > 20 and x < 30; in this case p1->nth_rec will point to the supr record + and p2->nth_rec will point to 6. */ + n_rows= 0; + should_count_the_left_border= false; + should_count_the_right_border= false; + } + } + else if (diverged && divergence_height == ULINT_UNDEFINED) + { + + if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1) + { + ut_ad(p1.page_id() != p2.page_id()); + divergence_height= root_height - height; + + n_rows= 0; + + if (p1.nth_rec() < p1.n_recs()) + { + n_rows+= p1.n_recs() - p1.nth_rec(); + } + + if (p2.nth_rec() > 1) + { + n_rows+= p2.nth_rec() - 1; + } + } + } + else if (divergence_height != ULINT_UNDEFINED) + { + /* All records before the right page was already counted. Add records from + p2->page_no which are to the left of the record which servers as a right + border of the range, if any (we don't include the record itself in this + count). 
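+
+    Worked example with made-up numbers: the left border page holds 4 user
+    records with the border on record 3, so 4 - 3 = 1 record is counted
+    there; the scan to the right hits the 9-page limit after counting 1800
+    more records without reaching the right page, and 40 rows were counted
+    between the paths on the previous level (roughly the number of pages on
+    this level). btr_estimate_n_rows_in_range_on_level() then extrapolates
+    40 * (1 + 1800) / 9 = 8004 rows (integer division) and marks the result
+    inexact; the records to the left of the right border on p2 are added
+    below, and an inexact total is finally doubled when the paths diverged
+    above the leaf level and capped at half of the table's row count.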
*/ + if (p2.nth_rec() > 1) + n_rows+= p2.nth_rec() - 1; + } + + if (height) + { + ut_ad(height > 0); + height--; + ut_ad(mtr.memo_contains(p1.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p1.block(), MTR_MEMO_PAGE_S_FIX)); + p1.read_child_page_id(&offsets, &heap); + ut_ad(mtr.memo_contains(p2.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p2.block(), MTR_MEMO_PAGE_S_FIX)); + p2.read_child_page_id(&offsets, &heap); + goto search_loop; + } + + should_count_the_left_border= + should_count_the_left_border && p1.should_count_the_left_border(); + should_count_the_right_border= + should_count_the_right_border && p2.should_count_the_right_border(); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + + range_start->page_id= p1.page_id(); + range_end->page_id= p2.page_id(); + + /* Here none of the borders were counted. For example, if on the leaf level + we descended to: + (inf, a, b, c, d, e, f, sup) + ^ ^ + path1 path2 + then n_rows will be 2 (c and d). */ + + if (is_n_rows_exact) + { + /* Only fiddle to adjust this off-by-one if the number is exact, otherwise + we do much grosser adjustments below. */ + + /* If both paths end up on the same record on the leaf level. */ + if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec()) + { + + /* n_rows can be > 0 here if the paths were first different and then + converged to the same record on the leaf level. + For example: + SELECT ... LIKE 'wait/synch/rwlock%' + mode1=PAGE_CUR_GE, + tuple1="wait/synch/rwlock" + path1[0]={nth_rec=58, n_recs=58, + page_no=3, page_level=1} + path1[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} + + mode2=PAGE_CUR_G + tuple2="wait/synch/rwlock" + path2[0]={nth_rec=57, n_recs=57, + page_no=3, page_level=1} + path2[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} */ + + /* If the range is such that we should count both borders, then avoid + counting that record twice - once as a left border and once as a right + border. Some of the borders should not be counted, e.g. [3,3). */ + n_rows= should_count_the_left_border && should_count_the_right_border; + } + else + n_rows+= should_count_the_left_border + should_count_the_right_border; + } + + if (root_height > divergence_height && !is_n_rows_exact) + /* In trees whose height is > 1 our algorithm tends to underestimate: + multiply the estimate by 2: */ + n_rows*= 2; + + DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows);); + + /* Do not estimate the number of rows in the range to over 1 / 2 of the + estimated rows in the whole table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) + { + + n_rows= table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, then we estimate all rows + are in the range */ + + if (n_rows == 0) + n_rows= table_n_rows; + } + + DBUG_RETURN(n_rows); + +error: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. 
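+
+For orientation (layout as assumed by the helpers below): an externally
+stored column is kept in the record as a locally stored prefix immediately
+followed by a 20-byte (BTR_EXTERN_FIELD_REF_SIZE) reference to the first
+BLOB page: the space id, the page number, the byte offset within that page,
+and an 8-byte length word whose most significant bits carry the ownership
+and inheritance flags, the length itself being the low 4 bytes at
+BTR_EXTERN_LEN + 4. Illustration:
+
+  field data:  [ local prefix, local_len - 20 bytes ][ 20-byte extern ref ]
+
+btr_rec_get_externally_stored_len() below converts each such length into
+whole pages: with 16KiB pages, a 40000-byte external part counts as
+ut_calc_align(40000, 16384) >> srv_page_size_shift = 49152 >> 14 = 3 pages.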
+@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(len_is_stored(local_len)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); + } + } + + return total_extern_len >> srv_page_size_shift; +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. */ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + bool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val &= ~BTR_EXTERN_OWNER_FLAG; + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val |= BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr); + } else { + mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len + + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. 
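+For example, in a pessimistic update the new version of the record inherits
+the BLOB columns that the update does not modify; disowning them in the old
+version prevents a later purge of the old version from freeing BLOB pages
+that the new version still references.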
*/ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i, false)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, false, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + if (!rec_offs_any_extern(offsets)) { + return; + } + + const ulint n = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, true, mtr); + } + } +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +uint32_t +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +uint32_t +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/** Deallocate a buffer block that was reserved for a BLOB part. +@param block buffer block +@param all flag whether to remove a ROW_FORMAT=COMPRESSED page +@param mtr mini-transaction to commit */ +static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) +{ + const page_id_t page_id(block->page.id()); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + mtr->commit(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + /* Attempt to deallocate the redundant copy of the uncompressed page + if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ + buf_LRU_free_page(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Helper class used while writing blob pages, during insert or update. 
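+It lets btr_store_big_rec_extern_fields() periodically commit and restart the
+caller's mini-transaction (see the commit_freq logic there), so that
+log_free_check() can run and latches are not pinned for the whole duration of
+a long BLOB write; the persistent cursor position is stored before the commit
+and restored afterwards.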
*/ +struct btr_blob_log_check_t { + /** Persistent cursor on a clusterex index record with blobs. */ + btr_pcur_t* m_pcur; + /** Mini transaction holding the latches for m_pcur */ + mtr_t* m_mtr; + /** rec_get_offsets(rec, index); offset of clust_rec */ + const rec_offs* m_offsets; + /** The block containing clustered record */ + buf_block_t** m_block; + /** The clustered record pointer */ + rec_t** m_rec; + /** The blob operation code */ + enum blob_op m_op; + + /** Constructor + @param[in] pcur persistent cursor on a clustered + index record with blobs. + @param[in] mtr mini-transaction holding latches for + pcur. + @param[in] offsets offsets of the clust_rec + @param[in,out] block record block containing pcur record + @param[in,out] rec the clustered record pointer + @param[in] op the blob operation code */ + btr_blob_log_check_t( + btr_pcur_t* pcur, + mtr_t* mtr, + const rec_offs* offsets, + buf_block_t** block, + rec_t** rec, + enum blob_op op) + : m_pcur(pcur), + m_mtr(mtr), + m_offsets(offsets), + m_block(block), + m_rec(rec), + m_op(op) + { + ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets)); + ut_ad((*m_block)->page.frame == page_align(*m_rec)); + ut_ad(*m_rec == btr_pcur_get_rec(m_pcur)); + } + + /** Check if there is enough space in log file. Commit and re-start the + mini transaction. */ + void check() + { + dict_index_t* index = m_pcur->index(); + ulint offs = 0; + uint32_t page_no = FIL_NULL; + + if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { + offs = page_offset(*m_rec); + page_no = (*m_block)->page.id().page_no(); + (*m_block)->page.fix(); + ut_ad(page_no != FIL_NULL); + } else { + btr_pcur_store_position(m_pcur, m_mtr); + } + m_mtr->commit(); + + DEBUG_SYNC_C("blob_write_middle"); + + const mtr_log_t log_mode = m_mtr->get_log_mode(); + m_mtr->start(); + m_mtr->set_log_mode(log_mode); + index->set_modified(*m_mtr); + + log_free_check(); + + DEBUG_SYNC_C("blob_write_middle_after_check"); + + if (UNIV_UNLIKELY(page_no != FIL_NULL)) { + dberr_t err; + if (UNIV_LIKELY(index->page != page_no)) { + ut_a(btr_root_block_get(index, RW_SX_LATCH, + m_mtr, &err)); + } + m_pcur->btr_cur.page_cur.block = btr_block_get( + *index, page_no, RW_X_LATCH, false, m_mtr); + /* The page should not be evicted or corrupted while + we are holding a buffer-fix on it. */ + m_pcur->btr_cur.page_cur.block->page.unfix(); + m_pcur->btr_cur.page_cur.rec + = m_pcur->btr_cur.page_cur.block->page.frame + + offs; + } else { + ut_ad(m_pcur->rel_pos == BTR_PCUR_ON); + mtr_sx_lock_index(index, m_mtr); + ut_a(m_pcur->restore_position( + BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED, + m_mtr) == btr_pcur_t::SAME_ALL); + } + + *m_block = btr_pcur_get_block(m_pcur); + *m_rec = btr_pcur_get_rec(m_pcur); + + rec_offs_make_valid(*m_rec, index, true, + const_cast<rec_offs*>(m_offsets)); + + ut_ad(m_mtr->memo_contains_page_flagged( + *m_rec, + MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); + + ut_ad((m_op == BTR_STORE_INSERT_BULK) + == !m_mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_X_LOCK)); + } +}; + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. + +TODO: If the allocation extends the tablespace, it will not be redo logged, in +any mini-transaction. 
Tablespace extension should be redo-logged, so that +recovery will not fail when the big_rec was written to the extended portion of +the file, in case the file was somehow truncated in the crash. + +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in: a persistent cursor */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*! in: operation code */ +{ + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + dberr_t error = DB_SUCCESS; + dict_index_t* index = pcur->index(); + buf_block_t* rec_block = btr_pcur_get_block(pcur); + rec_t* rec = btr_pcur_get_rec(pcur); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(op == BTR_STORE_INSERT_BULK + || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + if (!fil_page_index_page_check(page_align(rec))) { + if (op != BTR_STORE_INSERT_BULK) { + return DB_PAGE_CORRUPTED; + } + } + + btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, + &rec, op); + page_zip = buf_block_get_page_zip(rec_block); + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, int(page_zip_level), + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). 
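+		(A zero pointer consists of BTR_EXTERN_FIELD_REF_SIZE bytes
+		of 0; compare with field_ref_zero.)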
*/ + ut_a(op == BTR_STORE_UPDATE + || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + || !memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Space available in compressed page to carry blob data */ + const ulint payload_size_zip = rec_block->physical_size() + - FIL_PAGE_DATA; + + /* Space available in uncompressed page to carry blob data */ + const ulint payload_size = payload_size_zip + - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END); + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + const ulint field_no = big_rec_vec->fields[i].field_no; + + field_ref = btr_rec_get_field_ref(rec, offsets, field_no); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* A zero BLOB pointer should have been initially inserted. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + extern_len = big_rec_vec->fields[i].len; + MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len); + ut_a(extern_len > 0); + + uint32_t prev_page_no = FIL_NULL; + + if (page_zip) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (Bytef*) + big_rec_vec->fields[i].data; + c_stream.avail_in = static_cast<uInt>(extern_len); + } + + for (ulint blob_npages = 0;; ++blob_npages) { + buf_block_t* block; + const ulint commit_freq = 4; + uint32_t r_extents; + + ut_ad(page_align(field_ref) == page_align(rec)); + + if (!(blob_npages % commit_freq)) { + + redo_log.check(); + + field_ref = btr_rec_get_field_ref( + rec, offsets, field_no); + + page_zip = buf_block_get_page_zip(rec_block); + } + + ut_ad(btr_mtr->get_already_latched( + page_id_t{index->table->space_id, index->page}, + MTR_MEMO_PAGE_SX_FIX)); + + mtr.start(); + index->set_modified(mtr); + mtr.set_log_mode_sub(*btr_mtr); + + rec_block->page.fix(); + rec_block->page.lock.x_lock(); + + mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!btr_search_check_marked_free_index(rec_block)); +#endif + + uint32_t hint_prev = prev_page_no; + if (hint_prev == FIL_NULL) { + hint_prev = rec_block->page.id().page_no(); + } + + error = fsp_reserve_free_extents( + &r_extents, index->table->space, 1, + FSP_BLOB, &mtr, 1); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { +alloc_fail: + mtr.commit(); + goto func_exit; + } + + block = btr_page_alloc(index, hint_prev + 1, + FSP_NO_DIR, 0, &mtr, &mtr, + &error); + + index->table->space->release_free_extents(r_extents); + if (!block) { + goto alloc_fail; + } + + const uint32_t space_id = block->page.id().space(); + const uint32_t page_no = block->page.id().page_no(); + + if (prev_page_no == FIL_NULL) { + } else if (buf_block_t* prev_block = + buf_page_get_gen(page_id_t(space_id, + prev_page_no), + rec_block->zip_size(), + RW_X_LATCH, nullptr, + BUF_GET, &mtr, &error)) { + if (page_zip) { + mtr.write<4>(*prev_block, + prev_block->page.frame + + FIL_PAGE_NEXT, + page_no); + memcpy_aligned<4>( + buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_block->page.frame + + FIL_PAGE_NEXT, 4); + } else { + mtr.write<4>(*prev_block, + BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA + + prev_block->page.frame, + page_no); + } + } else { + goto alloc_fail; + } + + ut_ad(!page_has_siblings(block->page.frame)); + ut_ad(!fil_page_get_type(block->page.frame)); + + if (page_zip) { + int err; + page_zip_des_t* blob_page_zip; 
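+				/* This branch fills a ROW_FORMAT=COMPRESSED
+				BLOB page: tag it as FIL_PAGE_TYPE_ZBLOB
+				(first page) or FIL_PAGE_TYPE_ZBLOB2
+				(continuation page) and deflate as much of
+				the column as fits into the page payload. */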
+ + mtr.write<1>(*block, + FIL_PAGE_TYPE + 1 + + block->page.frame, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2); + block->page.zip.data[FIL_PAGE_TYPE + 1] + = block->page.frame[FIL_PAGE_TYPE + 1]; + + c_stream.next_out = block->page.frame + + FIL_PAGE_DATA; + c_stream.avail_out = static_cast<uInt>( + payload_size_zip); + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + mtr.memcpy(*block, + FIL_PAGE_DATA, + page_zip_get_size(page_zip) + - FIL_PAGE_DATA + - c_stream.avail_out); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. */ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, block->page.frame, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + /* We compress a page when finish bulk insert.*/ + if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) { + page_zip_write_blob_ptr( + rec_block, rec, index, offsets, + field_no, &mtr); + } + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. */ + btr_blob_free(block, FALSE, &mtr); + + if (err == Z_STREAM_END) { + break; + } + } else { + mtr.write<1>(*block, FIL_PAGE_TYPE + 1 + + block->page.frame, + FIL_PAGE_TYPE_BLOB); + + if (extern_len > payload_size) { + store_len = payload_size; + } else { + store_len = extern_len; + } + + mtr.memcpy<mtr_t::MAYBE_NOP>( + *block, + FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE + + block->page.frame, + static_cast<const byte*> + (big_rec_vec->fields[i].data) + + big_rec_vec->fields[i].len + - extern_len, store_len); + mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN + + FIL_PAGE_DATA + + block->page.frame, + store_len); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA, 4, 0xff); + + extern_len -= store_len; + + ut_ad(!mach_read_from_4(BTR_EXTERN_LEN + + field_ref)); + mtr.write<4>(*rec_block, + BTR_EXTERN_LEN + 4 + field_ref, + big_rec_vec->fields[i].len + - extern_len); + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mtr.write<4,mtr_t::MAYBE_NOP>( + *rec_block, + field_ref + BTR_EXTERN_SPACE_ID, + space_id); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA); + } + + prev_page_no = page_no; + + mtr.commit(); + + if (extern_len == 0) { + break; + } + } + } + + DBUG_EXECUTE_IF("btr_store_big_rec_extern", + error = DB_OUT_OF_FILE_SPACE; + goto func_exit;); + + rec_offs_make_nth_extern(offsets, field_no); + } + +func_exit: + if (page_zip) { + deflateEnd(&c_stream); + } + + if (heap != NULL) { + mem_heap_free(heap); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must be valid. 
*/ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page. +@param block uncompressed BLOB page +@param op operation +@return whether the type is invalid */ +static bool btr_check_blob_fil_page_type(const buf_block_t& block, + const char *op) +{ + uint16_t type= fil_page_get_type(block.page.frame); + + if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)); + else if (fil_space_t *space= fil_space_t::get(block.page.id().space())) + { + /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB + pages. Do not print anything about the type mismatch when reading + a BLOB page that may be from old versions. */ + bool fail= space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags); + if (fail) + sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u", + type, op, space->chain.start->name, + block.page.id().page_no()); + space->release(); + return fail; + } + return false; +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr + containing the latch to data an an + X-latch to the index tree */ +{ + const uint32_t space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + + ut_ad(index->is_primary()); + ut_ad(block->page.lock.have_x()); + ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(local_mtr->memo_contains_page_flagged(field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + ut_ad(index->table->space_id == index->table->space->id); + ut_ad(local_mtr->is_named_space(index->table->space)); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. 
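+		Such a zero reference is left behind when the transaction is
+		rolled back before btr_store_big_rec_extern_fields() has
+		filled in the BLOB pointer.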
*/ + ut_a(rollback); + return; + } + + ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) + & ~((BTR_EXTERN_OWNER_FLAG + | BTR_EXTERN_INHERITED_FLAG) << 24))); + ut_ad(space_id == index->table->space_id); + + const ulint ext_zip_size = index->table->space->zip_size(); + /* !rec holds in a call from purge when field_ref is in an undo page */ + ut_ad(rec || !block->page.zip.data); + + for (;;) { + mtr_t mtr; + + mtr.start(); + mtr.set_spaces(*local_mtr); + mtr.set_log_mode_sub(*local_mtr); + + ut_ad(!index->table->is_temporary() + || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); + + const uint32_t page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + buf_block_t* ext_block; + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || (rollback + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { +skip_free: + /* Do not free */ + mtr.commit(); + + return; + } + + ext_block = buf_page_get(page_id_t(space_id, page_no), + ext_zip_size, RW_X_LATCH, &mtr); + + if (!ext_block) { + goto skip_free; + } + + /* The buffer pool block containing the BLOB pointer is + exclusively latched by local_mtr. To satisfy some design + constraints, we must recursively latch it in mtr as well. */ + block->fix(); + block->page.lock.x_lock(); + + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!btr_search_check_marked_free_index(block)); +#endif + + const page_t* page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). */ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + MY_ASSERT_UNREACHABLE(); + } + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_NEXT); + + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4); + page_zip_write_blob_ptr(block, rec, index, + offsets, i, &mtr); + } else { + mtr.write<4>(*block, + BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + + 4 + field_ref, + 0U); + } + } else { + ut_ad(!block->page.zip.data); + btr_check_blob_fil_page_type(*ext_block, "purge"); + + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + 4 + + field_ref, 0U); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. 
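+This is used when the record itself is removed from the clustered index,
+typically from btr_cur_pessimistic_delete().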
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); + ut_ad(page_rec_is_leaf(rec)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, block, i, rollback, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, block, + ufield->field_no, rollback, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
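+On each BLOB page the stored part of the column is preceded by a small
+header: BTR_BLOB_HDR_PART_LEN holds the number of data bytes stored on this
+page and BTR_BLOB_HDR_NEXT_PAGE_NO the number of the next BLOB page, or
+FIL_NULL on the last page; the data starts BTR_BLOB_HDR_SIZE bytes after the
+header.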
+@return number of bytes written to buf */ +static +ulint +btr_copy_blob_prefix( +/*=================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + uint32_t len, /*!< in: length of buf, in bytes */ + page_id_t id, /*!< in: page identifier of the first BLOB page */ + uint32_t offset) /*!< in: offset on the first BLOB page */ +{ + ulint copied_len = 0; + + for (;;) { + mtr_t mtr; + buf_block_t* block; + const page_t* page; + const byte* blob_header; + ulint part_len; + ulint copy_len; + + mtr_start(&mtr); + + block = buf_page_get(id, 0, RW_S_LATCH, &mtr); + if (!block || btr_check_blob_fil_page_type(*block, "read")) { + mtr.commit(); + return copied_len; + } + page = buf_block_get_frame(block); + + blob_header = page + offset; + part_len = btr_blob_get_part_len(blob_header); + copy_len = ut_min(part_len, len - copied_len); + + memcpy(buf + copied_len, + blob_header + BTR_BLOB_HDR_SIZE, copy_len); + copied_len += copy_len; + + id.set_page_no(btr_blob_get_next_page_no(blob_header)); + + mtr_commit(&mtr); + + if (id.page_no() == FIL_NULL || copy_len != part_len) { + MEM_CHECK_DEFINED(buf, copied_len); + return(copied_len); + } + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + ut_ad(copied_len <= len); + } +} + +/** Copies the prefix of a compressed BLOB. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the field, +or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size +@param[in] id page identifier of the BLOB pages +@return number of bytes written to buf */ +static +ulint +btr_copy_zblob_prefix( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = static_cast<uInt>(len); + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(id.space()); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + for (;;) { + buf_page_t* bpage; + uint32_t next_page_no; + + /* There is no latch on bpage directly. Instead, + bpage is protected by the B-tree page latch that + is being held on the clustered index record, or, + in row_merge_copy_blobs(), by an exclusive table lock. */ + bpage = buf_page_get_zip(id, zip_size); + + if (UNIV_UNLIKELY(!bpage)) { + ib::error() << "Cannot load compressed BLOB " << id; + goto func_exit; + } + + if (UNIV_UNLIKELY + (fil_page_get_type(bpage->zip.data) != page_type)) { + + ib::error() << "Unexpected type " + << fil_page_get_type(bpage->zip.data) + << " of compressed BLOB page " << id; + + ut_ad(0); + goto end_of_blob; + } + + next_page_no = mach_read_from_4(bpage->zip.data + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. 
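+			For the first page the BLOB reference's
+			BTR_EXTERN_OFFSET is set to FIL_PAGE_NEXT by
+			btr_store_big_rec_extern_fields(), while the deflate
+			stream itself starts at FIL_PAGE_DATA.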
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = uInt(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ib::error() << "inflate() of compressed BLOB page " + << id + << " returned " << err + << " (" << d_stream.msg << ")"; + + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ib::error() + << "Unexpected end of compressed " + << "BLOB page " << id; + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + bpage->lock.s_unlock(); + bpage->unfix(); + goto func_exit; + } + + bpage->lock.s_unlock(); + bpage->unfix(); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + id.set_page_no(next_page_no); + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + MEM_CHECK_DEFINED(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the +field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] id page identifier of the first BLOB page +@param[in] offset offset on the first BLOB page +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + if (len == 0) + return 0; + + return zip_size + ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset) + : btr_copy_blob_prefix(buf, len, id, offset); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. +@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len) +{ + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. 
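+		btr_free_externally_stored_field() zeroes this length field
+		when it frees the BLOB pages, so a zero length means the BLOB
+		is being or has been deleted.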
*/ + + return(0); + } + + uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + len -= local_len; + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + uint32_t(len), + zip_size, + page_id_t( + space_id, + page_no), + offset)); +} + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap) +{ + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + uint32_t space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + uint32_t extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + page_id_t( + space_id, + page_no), + offset); + + return(buf); +} + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offset array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap) +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(NULL); + } + + return(btr_copy_externally_stored_field(len, data, + zip_size, local_len, heap)); +} |
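+
+/* Illustrative usage sketch (not part of the original file): reading one
+externally stored column with the helpers above. "rec", "offsets", "block"
+and "field_no" are assumed to be supplied by a caller that holds the
+required lock or page latch on the clustered index record.
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	ulint		len;
+	byte*		data = btr_rec_copy_externally_stored_field(
+		rec, offsets, block->zip_size(), field_no, &len, heap);
+
+	if (data == NULL) {
+		// BLOB not written yet: only visible to recovered-rollback
+		// or READ UNCOMMITTED readers
+	} else {
+		// data[0..len) remains valid until heap is freed
+	}
+
+	mem_heap_free(heap);
+*/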