Diffstat (limited to 'storage/innobase/btr/btr0btr.cc')
-rw-r--r-- | storage/innobase/btr/btr0btr.cc | 5433 |
1 files changed, 5433 insertions, 0 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc new file mode 100644 index 00000000..08be1991 --- /dev/null +++ b/storage/innobase/btr/btr0btr.cc @@ -0,0 +1,5433 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#include "page0page.h" +#include "page0zip.h" +#include "gis0rtree.h" + +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "btr0defragment.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "gis0geo.h" +#include "dict0boot.h" +#include "row0sel.h" /* row_search_max_autoinc() */ +#include "log.h" + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. */ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/* +Latching strategy of the InnoDB B-tree +-------------------------------------- + +Node pointer page latches acquisition is protected by index->lock latch. + +Before MariaDB 10.2.2, all node pointer pages were protected by index->lock +either in S (shared) or X (exclusive) mode and block->lock was not acquired on +node pointer pages. + +After MariaDB 10.2.2, block->lock S-latch or X-latch is used to protect +node pointer pages and obtaiment of node pointer page latches is protected by +index->lock. + +(0) Definition: B-tree level. + +(0.1) The leaf pages of the B-tree are at level 0. + +(0.2) The parent of a page at level L has level L+1. (The level of the +root page is equal to the tree height.) + +(0.3) The B-tree lock (index->lock) is the parent of the root page and +has a level = tree height + 1. + +Index->lock has 3 possible locking modes: + +(1) S-latch: + +(1.1) All latches for pages must be obtained in descending order of tree level. + +(1.2) Before obtaining the first node pointer page latch at a given B-tree +level, parent latch must be held (at level +1 ). + +(1.3) If a node pointer page is already latched at the same level +we can only obtain latch to its right sibling page latch at the same level. 
+ +(1.4) Release of the node pointer page latches must be done in +child-to-parent order. (Prevents deadlocks when obtained index->lock +in SX mode). + +(1.4.1) Level L node pointer page latch can be released only when +no latches at children level i.e. level < L are hold. + +(1.4.2) All latches from node pointer pages must be released so +that no latches are obtained between. + +(1.5) [implied by (1.1), (1.2)] Root page latch must be first node pointer +latch obtained. + +(2) SX-latch: + +In this case rules (1.2) and (1.3) from S-latch case are relaxed and +merged into (2.2) and rule (1.4) is removed. Thus, latch acquisition +can be skipped at some tree levels and latches can be obtained in +a less restricted order. + +(2.1) [identical to (1.1)]: All latches for pages must be obtained in descending +order of tree level. + +(2.2) When a node pointer latch at level L is obtained, +the left sibling page latch in the same level or some ancestor +page latch (at level > L) must be hold. + +(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained can +be any node pointer page. + +(3) X-latch: + +Node pointer latches can be obtained in any order. + +NOTE: New rules after MariaDB 10.2.2 does not affect the latching rules of leaf pages: + +index->lock S-latch is needed in read for the node pointer traversal. When the leaf +level is reached, index-lock can be released (and with the MariaDB 10.2.2 changes, all +node pointer latches). Left to right index travelsal in leaf page level can be safely done +by obtaining right sibling leaf page latch and then releasing the old page latch. + +Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock +S-latch. + +B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page +allocations are protected by index->lock X-latch. + +Node pointers +------------- +Leaf pages of a B-tree contain the index records stored in the +tree. On levels n > 0 we store 'node pointers' to pages on level +n - 1. For each page there is exactly one node pointer stored: +thus the our tree is an ordinary B-tree, not a B-link tree. + +A node pointer contains a prefix P of an index record. The prefix +is long enough so that it determines an index record uniquely. +The file page number of the child page is added as the last +field. To the child page we can store node pointers or index records +which are >= P in the alphabetical order, but < P1 if there is +a next node pointer on the level, and P1 is its prefix. + +If a node pointer with a prefix P points to a non-leaf child, +then the leftmost record in the child must have the same +prefix P. If it points to a leaf node, the child is not required +to contain any record with a prefix equal to P. The leaf case +is decided this way to allow arbitrary deletions in a leaf node +without touching upper levels of the tree. + +We have predefined a special minimum record which we +define as the smallest record in any alphabetical order. +A minimum record is denoted by setting a bit in the record +header. A minimum record acts as the prefix of a node pointer +which points to a leftmost node on any level of the tree. + +File page allocation +-------------------- +In the root node of a B-tree there are two file segment headers. +The leaf pages of a tree are allocated from one file segment, to +make them consecutive on disk if possible. From the other file segment +we allocate pages for the non-leaf levels of the tree. +*/ + +/** Check a file segment header within a B-tree root page. 
+@param offset file segment header offset +@param block B-tree root page +@param space tablespace +@return whether the segment header is valid */ +static bool btr_root_fseg_validate(ulint offset, + const buf_block_t &block, + const fil_space_t &space) +{ + ut_ad(block.page.id().space() == space.id); + const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET + + block.page.frame); + if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END && + mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) == space.id) + return true; + sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted " + "at " ULINTPF, + block.page.id().page_no(), + UT_LIST_GET_FIRST(space.chain)->name); + return false; +} + +/** Report a decryption failure. */ +ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index) +{ + ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index.table->name.m_name); + index.table->file_unreadable= true; +} + +/** Get an index page and declare its latching order level. +@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in,out] mtr mini-transaction +@param[out] err error code +@return block */ +buf_block_t *btr_block_get(const dict_index_t &index, + uint32_t page, rw_lock_type_t mode, bool merge, + mtr_t *mtr, dberr_t *err) +{ + ut_ad(mode != RW_NO_LATCH); + dberr_t local_err; + if (!err) + err= &local_err; + buf_block_t *block= + buf_page_get_gen(page_id_t{index.table->space->id, page}, + index.table->space->zip_size(), mode, nullptr, BUF_GET, + mtr, err, merge && !index.is_clust()); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (!!page_is_comp(block->page.frame) != index.table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index.id || + !fil_page_index_page_check(block->page.frame) || + index.is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. 
+@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + if (!index->table || !index->table->space) + { + *err= DB_TABLESPACE_NOT_FOUND; + return nullptr; + } + + buf_block_t *block; +#ifndef BTR_CUR_ADAPT + static constexpr buf_block_t *guess= nullptr; +#else + buf_block_t *&guess= btr_search_get_info(index)->root_guess; + guess= +#endif + block= + buf_page_get_gen(page_id_t{index->table->space->id, index->page}, + index->table->space->zip_size(), mode, guess, BUF_GET, + mtr, err, false); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (UNIV_UNLIKELY(mode == RW_NO_LATCH)); + else if (!!page_is_comp(block->page.frame) != + index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + !fil_page_index_page_check(block->page.frame) || + index->is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + else if (index->is_ibuf()); + else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *index->table->space) || + !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *index->table->space)) + { + *err= DB_CORRUPTION; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. +@return root page, sx-latched */ +static +page_t* +btr_root_get( +/*=========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + /* Intended to be used for accessing file segment lists. + Concurrent read of other data is allowed. */ + if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)) + return root->page.frame; + return nullptr; +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > srv_page_size - FIL_PAGE_DATA_END) { + return false; + } + + seg_header += FSEG_HDR_SPACE; + + mach_write_to_4(seg_header, space); + if (UNIV_LIKELY_NULL(page_zip)) { + memcpy(page_zip->data + page_offset(seg_header), seg_header, + 4); + } + + return true; +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. 
+@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get_gen( + page_id_t(table->space->id, index->page), + table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, + &mtr, &err); + if (!block) { + ut_ad(err != DB_SUCCESS); + goto func_exit; + } + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!fil_page_index_page_check(page) || page_has_siblings(page)) { + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + /* Check that the table flags and the tablespace + flags match. */ + uint32_t tf = dict_tf_to_fsp_flags(table->flags); + uint32_t sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mysql_mutex_lock(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mysql_mutex_unlock(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. */ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, table->space_id) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, table->space_id))) { + + err = DB_CORRUPTION; + } + +func_exit: + mtr_commit(&mtr); + + return(err); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/ +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + + block->page.frame); + + if (UNIV_LIKELY_NULL(page_zip)) + { + mach_write_to_8(index_id, index->id); + page_create_zip(block, index, level, 0, mtr); + } + else + { + page_create(block, mtr, dict_table_is_comp(index->table)); + if (index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>(*block, + my_assume_aligned<2>(PAGE_HEADER + + PAGE_LEVEL + + block->page.frame), + level); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id); + } +} + +buf_block_t * +mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const +{ + ut_ad(is_active()); + ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX || + type == MTR_MEMO_PAGE_S_FIX); + for (ulint i= 0; i < m_memo.size(); i++) + { + const mtr_memo_slot_t &slot= m_memo[i]; + const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY); + if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type) + { + buf_block_t *block= static_cast<buf_block_t*>(slot.object); + if (block->page.id() == id) + return block; + } + } + return nullptr; +} + +/** Fetch an index root page that was already latched in the +mini-transaction. */ +static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr) +{ + return mtr->get_already_latched(page_id_t{index.table->space_id, index.page}, + MTR_MEMO_PAGE_SX_FIX); +} + +/** Fetch an index page that should have been already latched in the +mini-transaction. */ +static buf_block_t * +btr_block_reget(mtr_t *mtr, const dict_index_t &index, + const page_id_t id, dberr_t *err) +{ + if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX)) + { + *err= DB_SUCCESS; + return block; + } + + ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK)); + return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err); +} + +/**************************************************************//** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! 
+@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t *root= btr_get_latched_root(*index, mtr); + if (UNIV_UNLIKELY(!root)) + return root; + buf_block_t *new_block= + buf_page_get_gen(page_id_t(IBUF_SPACE_ID, + mach_read_from_4(PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST + + FLST_FIRST + FIL_ADDR_PAGE + + root->page.frame)), + 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); + if (new_block) + *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block, + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (*err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return new_block; +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + const auto savepoint= mtr->get_savepoint(); + buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err); + if (UNIV_UNLIKELY(!root)) + return root; + + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + + if (!have_latch && + UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)))) + return root; + + fseg_header_t *seg_header= root->page.frame + + (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF); + return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, + true, mtr, init_mtr, err); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + dberr_t* err) /*!< out: error code */ +{ + ut_ad(level < BTR_MAX_NODE_LEVEL); + return index->is_ibuf() + ? btr_page_alloc_for_ibuf(index, mtr, err) + : btr_page_alloc_low(index, hint_page_no, file_direction, level, + mtr, init_mtr, err); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. 
*/ +static +dberr_t +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + buf_block_t *root= btr_get_latched_root(*index, mtr); + dberr_t err= + flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return err; +} + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page +@param[in] latched whether index->table->space->x_lock() was called +@return error code */ +dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob, bool space_latched) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG + if (btr_search_check_marked_free_index(block)) + { + ut_ad(!blob); + ut_ad(page_is_leaf(block->page.frame)); + } +#endif + const uint32_t page{block->page.id().page_no()}; + ut_ad(index->table->space_id == block->page.id().space()); + /* The root page is freed by btr_free_root(). */ + ut_ad(page != index->page); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + buf_block_modify_clock_inc(block); + + /* TODO: Discard any operations for block from mtr->m_log. + The page will be freed, so previous changes to it by this + mini-transaction should not matter. */ + + if (index->is_ibuf()) + return btr_page_free_for_ibuf(index, block, mtr); + + fil_space_t *space= index->table->space; + dberr_t err; + + const auto savepoint= mtr->get_savepoint(); + if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err)) + { + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + if (have_latch || + (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err))) + err= fseg_free_page(&root->page.frame[blob || + page_is_leaf(block->page.frame) + ? PAGE_HEADER + PAGE_BTR_SEG_LEAF + : PAGE_HEADER + PAGE_BTR_SEG_TOP], + space, page, mtr, space_latched); + } + if (err == DB_SUCCESS) + buf_page_free(space, page, mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain exclusively latched until mtr_t::commit() or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + return err; +} + +/** Set the child page number in a node pointer record. +@param[in,out] block non-leaf index page +@param[in,out] rec node pointer record in the page +@param[in] offsets rec_get_offsets(rec) +@param[in] page_no child page number +@param[in,out] mtr mini-transaction +Sets the child node file address in a node pointer. 
*/ +inline void btr_node_ptr_set_child_page_no(buf_block_t *block, + rec_t *rec, const rec_offs *offsets, + ulint page_no, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_rec_is_leaf(rec)); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + const ulint offs= rec_offs_data_size(offsets); + ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == + REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + page_zip_write_node_ptr(block, rec, offs, page_no, mtr); + else + mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); +} + +MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result)) +/************************************************************//** +Returns the child page of a node pointer and sx-latches it. +@return child page, sx-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err = nullptr) /*!< out: error code */ +{ + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), + RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, + mtr, err); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an sx-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_node_ptr_for_validate( + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); + dict_index_t* index = btr_cur_get_index(cursor); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + ut_ad(dict_index_get_page(index) != page_no); + + const auto level = btr_page_get_level(btr_cur_get_page(cursor)); + + const rec_t* user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + if (btr_cur_search_to_nth_level(level + 1, + dict_index_build_node_ptr(index, + user_rec, 0, + heap, level), + RW_S_LATCH, + cursor, mtr) != DB_SUCCESS) { + return nullptr; + } + + const rec_t* node_ptr = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + offsets = nullptr; + } + + return(offsets); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/** Return the node pointer to a page. 
+@param offsets work area for the return value +@param heap memory heap +@param cursor in: child page; out: node pointer to it +@param mtr mini-transaction +@return rec_get_offsets() of the node pointer record +@retval nullptr if the parent page had not been latched in mtr */ +static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap, + btr_cur_t *cursor, mtr_t *mtr) +{ + const uint32_t page_no= cursor->block()->page.id().page_no(); + const dict_index_t *index= cursor->index(); + ut_ad(!index->is_spatial()); + ut_ad(index->page != page_no); + + uint32_t p= index->page; + auto level= btr_page_get_level(cursor->block()->page.frame); + const dtuple_t *tuple= + dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level); + level++; + + ulint i; + for (i= 0; i < mtr->get_savepoint(); i++) + if (buf_block_t *block= mtr->block_at_savepoint(i)) + if (block->page.id().page_no() == p) + { + ut_ad(block->page.lock.have_u_or_x() || + (!block->page.lock.have_s() && index->lock.have_x())); + ulint up_match= 0, low_match= 0; + cursor->page_cur.block= block; + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, + &low_match, &cursor->page_cur, + nullptr)) + return nullptr; + offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); + if (p != page_no) + { + if (btr_page_get_level(block->page.frame) == level) + return nullptr; + i= 0; // MDEV-29835 FIXME: require all pages to be latched in order! + continue; + } + ut_ad(block->page.lock.have_u_or_x()); + if (block->page.lock.have_u_not_x()) + { + /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the + root page initially. */ + ut_ad(block->page.id().page_no() == index->page); + block->page.lock.u_x_upgrade(); + mtr->page_lock_upgrade(*block); + } + return offsets; + } + + return nullptr; +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return nullptr; + cursor->page_cur.rec= rec; + return btr_page_get_parent(offsets, heap, cursor, mtr); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] mtr mini-transaction +@param[in,out] cursor cursor pointing to the x-latched parent page +@return whether the cursor was successfully positioned */ +bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor) +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return false; + cursor->page_cur.rec= rec; + mem_heap_t *heap= mem_heap_create(100); + const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr); + mem_heap_free(heap); + return got; +} + +#ifdef UNIV_DEBUG +/** PAGE_INDEX_ID value for freed index B-trees */ +constexpr index_id_t BTR_FREED_INDEX_ID = 0; +#endif + +/** Free a B-tree root page. btr_free_but_not_root() must already +have been called. 
+@param block index root page +@param space tablespace +@param mtr mini-transaction */ +static void btr_free_root(buf_block_t *block, const fil_space_t &space, + mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->is_named_space(&space)); + + btr_search_drop_page_hash_index(block, false); + + if (btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space)) + { + /* Free the entire segment in small steps. */ + ut_d(mtr->freeing_tree()); + while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, mtr)); + } +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Prepare to free a B-tree. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction +@return root block, to invoke btr_free_but_not_root() and btr_free_root() +@retval NULL if the page is no longer a matching B-tree page */ +static +buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size, + index_id_t index_id, mtr_t *mtr) +{ + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_X_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr); + + if (!block); + else if (fil_page_index_page_check(block->page.frame) && + index_id == btr_page_get_index_id(block->page.frame)) + /* This should be a root page. It should not be possible to + reassign the same index_id for some other index in the + tablespace. */ + ut_ad(!page_has_siblings(block->page.frame)); + else + block= nullptr; + + return block; +} + +/** Initialize the root page of the b-tree +@param[in,out] block root block +@param[in] index_id index id +@param[in] index index of root page +@param[in,out] mtr mini-transaction */ +static void btr_root_page_init(buf_block_t *block, index_id_t index_id, + dict_index_t *index, mtr_t *mtr) +{ + constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID; + byte *page_index_id= my_assume_aligned<2>(field + block->page.frame); + + /* Create a new index page on the allocated segment page */ + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + mach_write_to_8(page_index_id, index_id); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); + } + else + { + page_create(block, mtr, index && index->table->not_redundant()); + if (index && index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>( + *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, index_id); + } +} + +/** Create the root node for a new index tree. 
+@param[in] type type of the index +@param[in] index_id index id +@param[in,out] space tablespace where created +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@param[out] err error code +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr, + dberr_t* err) +{ + buf_block_t* block; + + ut_ad(mtr->is_named_space(space)); + ut_ad(index_id != BTR_FREED_INDEX_ID); + ut_ad(index || space == fil_system.sys_space); + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (UNIV_UNLIKELY(type & DICT_IBUF)) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err); + + if (ibuf_hdr_block == NULL) { + return(FIL_NULL); + } + + ut_ad(ibuf_hdr_block->page.id().page_no() + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page_general( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, false, mtr, mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); + + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, + mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, + err, false, block)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(block, *space, mtr); + return(FIL_NULL); + } + } + + ut_ad(!page_has_siblings(block->page.frame)); + + btr_root_page_init(block, index_id, index, mtr); + + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. + + Note: Insert Buffering is disabled for temporary tables given that + most temporary tables are smaller in size and short-lived. */ + if (!(type & DICT_CLUSTERED) + && (!index || !index->table->is_temporary())) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(block->page.frame, 2) + > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(block->page.id().page_no()); +} + +/** Free a B-tree except the root page. The root page MUST be freed after +this by calling btr_free_root. 
+@param[in,out] block root page +@param[in] log_mode mtr logging mode */ +static +void +btr_free_but_not_root( + buf_block_t* block, + mtr_log_t log_mode +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi=false +#endif + ) +{ + mtr_t mtr; + + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!page_has_siblings(block->page.frame)); +leaf_loop: + mtr_start(&mtr); + ut_d(mtr.freeing_tree()); + mtr_set_log_mode(&mtr, log_mode); + fil_space_t *space = mtr.set_named_space_id(block->page.id().space()); + + if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space)) { + mtr_commit(&mtr); + return; + } + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + space = mtr.set_named_space_id(block->page.id().space()); + + finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space) + || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + ,ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + goto top_loop; + } +} + +/** Clear the index tree and reinitialize the root page, in the +rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized. +@param thr query thread +@return error code */ +TRANSACTIONAL_TARGET +dberr_t dict_index_t::clear(que_thr_t *thr) +{ + mtr_t mtr; + mtr.start(); + if (table->is_temporary()) + mtr.set_log_mode(MTR_LOG_NO_REDO); + else + set_modified(mtr); + mtr_sx_lock_index(this, &mtr); + + dberr_t err; + if (buf_block_t *root_block= + buf_page_get_gen(page_id_t(table->space->id, page), + table->space->zip_size(), + RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) + { + btr_free_but_not_root(root_block, mtr.get_log_mode() +#ifdef BTR_CUR_HASH_ADAPT + ,n_ahi_pages() != 0 +#endif + ); + +#ifdef BTR_CUR_HASH_ADAPT + if (root_block->index) + btr_search_drop_page_hash_index(root_block, false); + ut_ad(n_ahi_pages() == 0); +#endif + mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF, + FSEG_HEADER_SIZE, 0); + if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr, + &err, false, root_block)) + btr_root_page_init(root_block, id, this, &mtr); + } + + mtr.commit(); + return err; +} + +/** Free a persistent index tree if it exists. 
+@param[in,out] space tablespce +@param[in] page root page number +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction */ +void btr_free_if_exists(fil_space_t *space, uint32_t page, + index_id_t index_id, mtr_t *mtr) +{ + if (buf_block_t *root= btr_free_root_check(page_id_t(space->id, page), + space->zip_size(), + index_id, mtr)) + { + btr_free_but_not_root(root, mtr->get_log_mode()); + mtr->set_named_space(space); + btr_free_root(root, *space, mtr); + } +} + +/** Drop a temporary table +@param table temporary table */ +void btr_drop_temporary_table(const dict_table_t &table) +{ + ut_ad(table.is_temporary()); + ut_ad(table.space == fil_system.temp_space); + mtr_t mtr; + mtr.start(); + for (const dict_index_t *index= table.indexes.start; index; + index= dict_table_get_next_index(index)) + { + if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0, + RW_X_LATCH, nullptr, BUF_GET, &mtr, + nullptr, false)) + { + btr_free_but_not_root(block, MTR_LOG_NO_REDO); + mtr.set_log_mode(MTR_LOG_NO_REDO); + btr_free_root(block, *fil_system.temp_space, &mtr); + mtr.commit(); + mtr.start(); + } + } + mtr.commit(); +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. +@param[in,out] index clustered index +@return the last used AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc(dict_index_t* index) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + mtr_t mtr; + mtr.start(); + ib_uint64_t autoinc; + if (buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr)) { + autoinc = page_get_autoinc(block->page.frame); + } else { + autoinc = 0; + } + mtr.commit(); + return autoinc; +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, +or fall back to MAX(auto_increment_column). +@param[in] table table containing an AUTO_INCREMENT column +@param[in] col_no index of the AUTO_INCREMENT column +@return the AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) +{ + ut_ad(table->persistent_autoinc); + ut_ad(!table->is_temporary()); + + dict_index_t* index = dict_table_get_first_index(table); + + if (index == NULL) { + return 0; + } + + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr); + + ib_uint64_t autoinc = block + ? page_get_autoinc(block->page.frame) : 0; + const bool retry = block && autoinc == 0 + && !page_is_empty(block->page.frame); + mtr.commit(); + + if (retry) { + /* This should be an old data file where + PAGE_ROOT_AUTO_INC was initialized to 0. + Fall back to reading MAX(autoinc_col). + There should be an index on it. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(table, col_no); + while (index && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); + } + + if (index) { + autoinc = row_search_max_autoinc(index); + } + } + + return autoinc; +} + +/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. 
+@param[in,out] index clustered index +@param[in] autoinc the AUTO_INCREMENT value +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + + mtr_t mtr; + mtr.start(); + fil_space_t *space= index->table->space; + if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page), + space->zip_size(), RW_SX_LATCH, &mtr)) + { + mtr.set_named_space(space); + page_set_autoinc(root, autoinc, &mtr, reset); + } + + mtr.commit(); +} + +/** Reorganize an index page. +@param cursor index page cursor +@param mtr mini-transaction */ +static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr) +{ + buf_block_t *const block= cursor->block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!is_buf_block_get_page_zip(block)); + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(cursor->index->is_dummy || + block->page.id().space() == cursor->index->table->space->id); + ut_ad(cursor->index->is_dummy || + block->page.id().page_no() != cursor->index->page || + !page_has_siblings(block->page.frame)); + + /* Save the cursor position. */ + const ulint pos= page_rec_get_n_recs_before(cursor->rec); + + if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) + return DB_CORRUPTION; + + btr_search_drop_page_hash_index(block, false); + + buf_block_t *old= buf_block_alloc(); + /* Copy the old page to temporary space */ + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->page.frame, block->page.frame, + srv_page_size); + + const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO); + + page_create(block, mtr, cursor->index->table->not_redundant()); + if (cursor->index->is_spatial()) + block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE); + + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + dberr_t err= + page_copy_rec_list_end_no_locks(block, old, + page_get_infimum_rec(old->page.frame), + cursor->index, mtr); + mtr->set_log_mode(log_mode); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + return err; + + /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ + ut_ad(!page_get_max_trx_id(block->page.frame)); + memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame, + PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8); +#ifdef UNIV_DEBUG + if (page_get_max_trx_id(block->page.frame)) + /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than + clustered index root pages. */ + ut_ad(dict_index_is_sec_or_ibuf(cursor->index) + ? page_is_leaf(block->page.frame) + : block->page.id().page_no() == cursor->index->page); + else + /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than + the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf + pages, and in temporary tables. It was always zero-initialized in + page_create(). PAGE_MAX_TRX_ID must be nonzero on + dict_index_is_sec_or_ibuf() leaf pages. 
*/ + ut_ad(cursor->index->table->is_temporary() || + !page_is_leaf(block->page.frame) || + !dict_index_is_sec_or_ibuf(cursor->index)); +#endif + + const uint16_t data_size1= page_get_data_size(old->page.frame); + const uint16_t data_size2= page_get_data_size(block->page.frame); + const ulint max1= + page_get_max_insert_size_after_reorganize(old->page.frame, 1); + const ulint max2= + page_get_max_insert_size_after_reorganize(block->page.frame, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) + { + sql_print_error("InnoDB: Page old data size %u new data size %u" + ", page old max ins size %zu new max ins size %zu", + data_size1, data_size2, max1, max2); + return DB_CORRUPTION; + } + + /* Restore the cursor position. */ + if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos))) + return DB_CORRUPTION; + + if (block->page.id().page_no() != cursor->index->page || + fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT) + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + else if (!cursor->index->is_instant()) + { + ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE)); + ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2, + block->page.frame + FIL_PAGE_TYPE + 2, + PAGE_HEADER - FIL_PAGE_TYPE - 2)); + mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_INDEX); + } + else + { + /* Preserve the PAGE_INSTANT information. */ + memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_TYPE + old->page.frame, 2); + memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame, + PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2); + if (!cursor->index->table->instant); + else if (page_is_comp(block->page.frame)) + { + memcpy(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8); + memcpy(PAGE_NEW_SUPREMUM + block->page.frame, + PAGE_NEW_SUPREMUM + old->page.frame, 8); + } + else + { + memcpy(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8); + memcpy(PAGE_OLD_SUPREMUM + block->page.frame, + PAGE_OLD_SUPREMUM + old->page.frame, 8); + } + + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + } + + ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); + + if (!cursor->index->has_locking()); + else if (cursor->index->page == FIL_NULL) + ut_ad(cursor->index->is_dummy); + else + lock_move_reorganize_page(block, old); + + /* Write log for the changes, if needed. */ + if (log_mode == MTR_LOG_ALL) + { + /* Check and log the changes in the page header. */ + ulint a, e; + for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page header fields. */ + mtr->memcpy(*block, a, e - a); + break; + } + + const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP); + + if (page_is_comp(block->page.frame)) + { + /* info_bits=0, n_owned=1, heap_no=0, status */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + block->page.frame, + PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + old->page.frame, 3)); + /* If the 'next' pointer of the infimum record has changed, log it. 
*/ + a= PAGE_NEW_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + /* The rest of the supremum record must not change. */ + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + + REC_N_NEW_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + else + { + /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + block->page.frame, + PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + old->page.frame, 4)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_OLD_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + + REC_N_OLD_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + e= srv_page_size - PAGE_DIR; + a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame); + + /* Zero out the payload area. */ + mtr->memset(*block, top, a - top, 0); + + /* Log changes to the page directory. */ + for (; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page directory slots. */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + buf_block_free(old); + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); + return DB_SUCCESS; +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
+ +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t +btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (buf_block_get_page_zip(block)) + return page_zip_reorganize(block, index, z_level, mtr, true); + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index= index; + return btr_page_reorganize_low(&cur, mtr); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@param cursor page cursor +@param mtr mini-transaction +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr) +{ + if (!buf_block_get_page_zip(cursor->block)) + return btr_page_reorganize_low(cursor, mtr); + + ulint pos= page_rec_get_n_recs_before(cursor->rec); + if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) + return DB_CORRUPTION; + + dberr_t err= page_zip_reorganize(cursor->block, cursor->index, + page_zip_level, mtr, true); + if (err == DB_FAIL); + else if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(cursor->block->page.frame, pos))) + err= DB_CORRUPTION; + + return err; +} + +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); + ut_ad(!index->is_dummy); + ut_ad(index->table->space->id == block->page.id().space()); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block, false); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index + root page. */ + const ib_uint64_t autoinc + = dict_index_is_clust(index) + && index->page == block->page.id().page_no() + ? 
page_get_autoinc(block->page.frame) + : 0; + + if (page_zip) { + page_create_zip(block, index, level, autoinc, mtr); + } else { + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + + block->page.frame, level); + if (autoinc) { + mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID + + block->page.frame, autoinc); + } + } +} + +/** Write instant ALTER TABLE metadata to a root page. +@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) +{ + ut_ad(index.n_core_fields > 0); + ut_ad(index.n_core_fields < REC_MAX_N_FIELDS); + ut_ad(index.is_instant()); + ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT + || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX); + ut_ad(!page_has_siblings(root->page.frame)); + ut_ad(root->page.id().page_no() == index.page); + + rec_t* infimum = page_get_infimum_rec(root->page.frame); + rec_t* supremum = page_get_supremum_rec(root->page.frame); + byte* page_type = root->page.frame + FIL_PAGE_TYPE; + uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT); + + switch (mach_read_from_2(page_type)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_instant(root->page.frame) + == index.n_core_fields); + if (memcmp(infimum, "infimum", 8) + || memcmp(supremum, "supremum", 8)) { + ut_ad(index.table->instant); + ut_ad(!memcmp(infimum, field_ref_zero, 8)); + ut_ad(!memcmp(supremum, field_ref_zero, 7)); + /* The n_core_null_bytes only matters for + ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */ + ut_ad(supremum[7] == index.n_core_null_bytes + || !index.table->not_redundant()); + return; + } + break; + default: + ut_ad("wrong page type" == 0); + /* fall through */ + case FIL_PAGE_INDEX: + ut_ad(!page_is_comp(root->page.frame) + || !page_get_instant(root->page.frame)); + ut_ad(!memcmp(infimum, "infimum", 8)); + ut_ad(!memcmp(supremum, "supremum", 8)); + mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT); + ut_ad(i <= PAGE_NO_DIRECTION); + i |= static_cast<uint16_t>(index.n_core_fields << 3); + mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + + root->page.frame, i); + break; + } + + if (index.table->instant) { + mtr->memset(root, infimum - root->page.frame, 8, 0); + mtr->memset(root, supremum - root->page.frame, 7, 0); + mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7], + index.n_core_null_bytes); + } +} + +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. 
+@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + buf_block_t *root= btr_get_latched_root(index, mtr); + byte *page_type= root->page.frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->page.frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[supremum], + supremuminfimum, 8); +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
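+A rough sketch of the transformation, for illustration only:
+
+  before:  root page at level L, too full for the insert
+  after:   root page at level L + 1, holding a single node pointer to a
+           new page at level L that received the old root's records;
+           the new page is then split by btr_page_split_and_insert().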
+@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + dict_index_t* index; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root; + buf_block_t* new_block; + + root = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root); + ut_ad(!page_is_empty(root->page.frame)); + index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip + || page_zip_validate(root_page_zip, root->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + const page_id_t root_id{root->page.id()}; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); + + if (index->page != root_id.page_no()) { + ut_ad("corrupted root page number" == 0); + return nullptr; + } + + if (index->is_ibuf()) { + } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *root, *index->table->space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *root, *index->table->space)) { + return nullptr; + } + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root->page.frame); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + if (page_has_siblings(new_block->page.frame)) { + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV, + 0xff, 8); + mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, + 0xff, 8); + } + } + + /* Copy the records from root to the new page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root, + page_get_infimum_rec(root->page.frame), + index, mtr, err)) { + switch (*err) { + case DB_SUCCESS: + break; + case DB_FAIL: + *err = DB_SUCCESS; + break; + default: + return nullptr; + } + + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_block, root_page_zip, + root->page.frame, index, mtr); + + /* Update the lock table and possible hash index. 
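+	page_copy_rec_list_end() normally does this as a side effect of
+	the copy; after the byte-for-byte page_zip_copy_recs() fallback
+	above it has to be done explicitly, which is what the calls
+	below do.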
*/ + if (index->has_locking()) { + lock_move_rec_list_end( + new_block, root, + page_get_infimum_rec(root->page.frame)); + } + + /* Move any existing predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(new_block, root_id); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root); + } + } + + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; + if (dict_index_is_sec_or_ibuf(index)) { + /* In secondary indexes and the change buffer, + PAGE_MAX_TRX_ID can be reset on the root page, because + the field only matters on leaf pages, and the root no + longer is a leaf page. (Older versions of InnoDB did + set PAGE_MAX_TRX_ID on all secondary index pages.) */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } + } + } else { + /* PAGE_ROOT_AUTO_INC is only present in the clustered index + root page; on other clustered index pages, we want to reserve + the field PAGE_MAX_TRX_ID for future use. */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } + } + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + if (index->has_locking()) { + lock_update_root_raise(*new_block, root_id); + } + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + const uint32_t new_page_no = new_block->page.id().page_no(); + rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame)); + ut_ad(rec); /* We just created the page. */ + + /* Build the node pointer (= node key and page address) for the + child */ + if (dict_index_is_spatial(index)) { + rtr_mbr_t new_mbr; + + rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); + node_ptr = rtr_index_build_node_ptr( + index, &new_mbr, rec, new_page_no, *heap); + } else { + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + } + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(root->page.frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + btr_set_instant(root, *index, mtr); + } + + ut_ad(!page_has_siblings(root->page.frame)); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_block at this point. Thus, the data should fit. 
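+	A node pointer record consists only of the child page number and
+	the key fields needed at this level, so it is far smaller than
+	the free space of the just-emptied root; hence the assertion
+	below.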
*/ + ut_a(node_ptr_rec); + + /* We play safe and reset the free bits for the new page */ + + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + } + + page_cursor->block = new_block; + page_cursor->index = index; + + ut_ad(dtuple_check_typed(tuple)); + /* Reposition the cursor to the child node */ + ulint low_match = 0, up_match = 0; + + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, &low_match, + page_cursor, nullptr)) { + *err = DB_CORRUPTION; + return nullptr; + } + + /* Split the child and insert tuple */ + return btr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr, err); +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the left. +@param[in] cursor insert position +@return the first record to be moved to the right half page +@retval NULL if no split is recommended */ +rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) +{ + rec_t* split_rec = btr_cur_get_rec(cursor); + const page_t* page = page_align(split_rec); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + != page_rec_get_next(split_rec)) { + return NULL; + } + + /* The metadata record must be present in the leftmost leaf page + of the clustered index, if and only if index->is_instant(). + However, during innobase_instant_try(), index->is_instant() + would already hold when row_ins_clust_index_entry_low() + is being invoked to insert the the metadata record. + So, we can only assert that when the metadata record exists, + index->is_instant() must hold. */ + ut_ad(!page_is_leaf(page) || page_has_prev(page) + || cursor->index()->is_instant() + || !(rec_get_info_bits(page_rec_get_next_const( + page_get_infimum_rec(page)), + cursor->index()->table->not_redundant()) + & REC_INFO_MIN_REC_FLAG)); + + const rec_t* infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. */ + + if (split_rec == infimum + || split_rec == page_rec_get_next_const(infimum)) { + split_rec = page_rec_get_next(split_rec); + } + + return split_rec; +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the right. +@param[in] cursor insert position +@param[out] split_rec if split recommended, the first record + on the right half page, or + NULL if the to-be-inserted record + should be first +@return whether split is recommended */ +bool +btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec) +{ + rec_t* insert_point = btr_cur_get_rec(cursor); + const page_t* page = page_align(insert_point); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) { + return false; + } + + insert_point = page_rec_get_next(insert_point); + + if (!insert_point || page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } else { + insert_point = page_rec_get_next(insert_point); + if (page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. 
We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + } + + *split_rec = insert_point; + return true; +} + +/*************************************************************//** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. +@return split record, or NULL if tuple will be the first record on +the lower or upper half-page (determined by btr_page_tuple_smaller()) */ +static +rec_t* +btr_page_get_split_rec( +/*===================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + rec_offs* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (page_zip) { + /* Estimate the free space of an empty compressed page. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index()->n_fields, + page_zip_get_size(page_zip)); + + if (free_space > (ulint) free_space_zip) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index(), + offsets, page_is_leaf(page) + ? 
cursor->index() + ->n_core_fields + : 0, + ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. +@return true if fits */ +static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + rec_offs** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index()); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index(), *offsets)); + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (!(end_rec = split_rec)) { + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + } else if (cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0) { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + goto got_rec; + } + + if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) { + return false; + } + +got_rec: + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + page_is_leaf(page) + ? 
cursor->index()->n_core_fields + : 0, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + if (!(rec = page_rec_get_next_const(rec))) { + break; + } + } + + return(false); +} +#endif + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +dberr_t +btr_insert_on_non_leaf_level( + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(level > 0); + + flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG; + cursor.page_cur.index = index; + + dberr_t err; + + if (index->is_spatial()) { + /* For spatial index, initialize structures to track + its parents etc. */ + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + + rtr_info_update_btr(&cursor, &rtr_info); + err = rtr_search_to_nth_level(level, tuple, + PAGE_CUR_RTREE_INSERT, + BTR_CONT_MODIFY_TREE, + &cursor, mtr); + } else { + err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH, + &cursor, mtr); + } + + ut_ad(cursor.flag == BTR_CUR_BINARY); + ut_ad(btr_cur_get_block(&cursor) + != mtr->at_savepoint(mtr->get_savepoint() - 1) + || index->is_spatial() + || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + err = btr_cur_optimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (index->is_spatial()) { + ut_ad(cursor.rtr_info); + + rtr_clean_rtr_info(&rtr_info, true); + } + + return err; +} + +static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); +static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); +static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. 
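+In outline, this does three things (see the function body): for an FSP_DOWN
+split it repoints the parent's node pointer at the new lower half, it inserts
+a node pointer for the upper half one level up via
+btr_insert_on_non_leaf_level(), and it rewires the FIL_PAGE_PREV and
+FIL_PAGE_NEXT links of the two halves and of the outer neighbour pages,
+when those exist.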
*/ +static +dberr_t +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + buf_block_t* prev_block = nullptr; + buf_block_t* next_block = nullptr; + buf_block_t* lower_block; + buf_block_t* upper_block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + rec_offs* offsets; + + lower_block = new_block; + upper_block = block; + + cursor.page_cur.block = block; + cursor.page_cur.index = index; + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(nullptr, heap, mtr, + &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&cursor), + btr_cur_get_rec(&cursor), + offsets, lower_block->page.id().page_no(), mtr); + mem_heap_empty(heap); + } else { + lower_block = block; + upper_block = new_block; + } + + /* Get the level of the split pages */ + const ulint level = btr_page_get_level(block->page.frame); + ut_ad(level == btr_page_get_level(new_block->page.frame)); + page_id_t id{block->page.id()}; + + /* Get the previous and next pages of page */ + const uint32_t prev_page_no = btr_page_get_prev(block->page.frame); + const uint32_t next_page_no = btr_page_get_next(block->page.frame); + + /* for consistency, both blocks should be locked, before change */ + if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { + id.set_page_no(prev_page_no); + prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!prev_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + prev_block = btr_block_get(*index, prev_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + if (next_page_no != FIL_NULL && direction != FSP_DOWN) { + id.set_page_no(next_page_no); + next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!next_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + next_block = btr_block_get(*index, next_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr( + index, split_rec, upper_block->page.id().page_no(), + heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
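+	For example, if the parent page is itself full, the node pointer
+	insert at level + 1 goes through btr_page_split_and_insert() in
+	turn, and so on up the tree; a full root is handled by
+	btr_root_raise_and_insert() from btr_cur_pessimistic_insert().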
*/ + + dberr_t err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + + /* Update page links of the level */ + + if (prev_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_next(prev_block, lower_block->page.id().page_no(), + mtr); + } + + if (next_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, upper_block->page.id().page_no(), + mtr); + } + + if (direction == FSP_DOWN) { + ut_ad(lower_block == new_block); + ut_ad(btr_page_get_next(upper_block->page.frame) + == next_page_no); + btr_page_set_prev(lower_block, prev_page_no, mtr); + } else { + ut_ad(upper_block == new_block); + ut_ad(btr_page_get_prev(lower_block->page.frame) + == prev_page_no); + btr_page_set_next(upper_block, next_page_no, mtr); + } + + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); + + return DB_SUCCESS; +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + rec_offs** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + if (UNIV_UNLIKELY(!(first_rec = page_cur_move_to_next(&pcur)))) { + ut_ad("corrupted page" == 0); + return false; + } + + *offsets = rec_get_offsets(first_rec, cursor->index(), *offsets, + page_is_leaf(block->page.frame) + ? cursor->index()->n_core_fields : 0, + n_uniq, heap); + + return cmp_dtuple_rec(tuple, first_rec, cursor->index(), *offsets) < 0; +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. 
+@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + rec_offs** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + ut_ad(dtuple_check_typed(tuple)); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return nullptr; + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = nullptr; + ulint max_size; + + next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH, + page_is_leaf(page), mtr); + if (UNIV_UNLIKELY(!next_block)) { + return nullptr; + } + next_page = buf_block_get_frame(next_block); + const bool is_leaf = page_is_leaf(next_page); + + next_page_cursor.index = cursor->index(); + next_page_cursor.block = next_block; + next_father_cursor.page_cur = next_page_cursor; + + if (!btr_page_get_father(mtr, &next_father_cursor)) { + return nullptr; + } + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + &next_page_cursor, nullptr)) { + return nullptr; + } + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + if (is_leaf && cursor->index()->has_locking()) { + lock_update_node_pointer(block, next_block); + } + + rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap, + n_ext, mtr); + + if (!rec) { + if (is_leaf + && next_block->page.zip.ssize + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. 
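+			The reorganization may have changed the amount of
+			free space on the compressed page, and the bits in
+			the bitmap must never exceed the real free space,
+			so resetting them to zero is always safe (see the
+			longer explanation at the end of btr_compress()).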
*/ + ibuf_reset_free_bits(next_block); + } + return nullptr; + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, false, mtr); + + if (err != DB_SUCCESS) { + return nullptr; + } + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, false, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index(), rec, next_block->page.id().page_no(), + heap, level); + + if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1, + node_ptr, mtr) != DB_SUCCESS) { + return nullptr; + } + + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + + if (is_leaf + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + if (next_block->page.zip.ssize) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + ut_ad(!dict_index_is_spatial(index)); + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page, index)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec), + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + dberr_t err; + if (!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr, &err)) { + return err; + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + return page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); +} + +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. 
+This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + if (page_copy_rec_list_start(new_block, block, split_rec, index, mtr, &err)) + page_delete_rec_list_start(split_rec, block, index, mtr); + return err; +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record or NULL if run out of space */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ulint n_iterations = 0; + ulint n_uniq; + + ut_ad(*err == DB_SUCCESS); + ut_ad(dtuple_check_typed(tuple)); + + buf_pool.pages_split++; + + if (cursor->index()->is_spatial()) { + /* Split rtree page and update parent */ + return rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr, err); + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index()); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index())); + ut_ad(cursor->index()->lock.have_u_or_x()); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + if (rec_t* rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { + return(rec); + } + + /* 1. 
Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + bool insert_left = false; + uint32_t hint_page_no = block->page.id().page_no() + 1; + byte direction = FSP_UP; + + if (n_iterations > 0) { + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { + direction = FSP_DOWN; + hint_page_no -= 2; + } else { + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + goto got_split_rec; + } + + if (UNIV_UNLIKELY(!split_rec)) { + *err = DB_CORRUPTION; + return nullptr; + } + } + +got_split_rec: + /* 2. Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index(), hint_page_no, direction, + page_level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index(), + page_level, mtr); + /* Only record the leaf level page splits. */ + if (!page_level) { + cursor->index()->stat_defrag_n_page_split ++; + cursor->index()->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index()); + } + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index(), + *offsets, page_is_leaf(page) + ? cursor->index()->n_core_fields + : 0, + n_uniq, heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + if (UNIV_UNLIKELY(!n_iterations)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + first_rec = page_rec_get_next(page_get_infimum_rec(page)); +insert_move_limit: + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + if (UNIV_UNLIKELY(!first_rec || !move_limit)) { + goto corrupted; + } + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = UT_NEW_ARRAY_NOKEY( + byte, + rec_get_converted_size(cursor->index(), tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(), + tuple, n_ext); + goto insert_move_limit; + } + + /* 4. Do first the modifications in the tree structure */ + + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! 
*/ + *err = btr_attach_half_pages(flags, cursor->index(), block, + first_rec, new_block, direction, mtr); + + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + +#ifdef UNIV_DEBUG + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + const bool insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, offsets, tuple, + n_ext, heap); +#endif + if (!split_rec && !insert_left) { + UT_DELETE_ARRAY(buf); + buf = NULL; + } + +#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled + if (insert_will_fit + && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index())) { + mtr->release(cursor->index()->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_start(new_block, block, + move_limit, + cursor->index(), + mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_block failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + *err = page_delete_rec_list_end(move_limit + - page + new_page, + new_block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index(), mtr); + } + + left_block = new_block; + right_block = block; + + if (cursor->index()->has_locking()) { + lock_update_split_left(right_block, left_block); + } + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_end(new_block, block, + move_limit, + cursor->index(), mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index(), mtr); + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_end(new_block, block, + move_limit); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. 
*/ + + *err = page_delete_rec_list_end(move_limit, block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + left_block = block; + right_block = new_block; + + if (cursor->index()->has_locking()) { + lock_update_split_right(right_block, left_block); + } + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index())); + ut_a(page_zip_validate(new_page_zip, new_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + rec_t* rec; + buf_block_t* const insert_block = insert_left + ? left_block : right_block; + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->block = insert_block; + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + page_cursor, nullptr)) { + *err = DB_CORRUPTION; + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + + if (page_cur_get_page_zip(page_cursor)) { + goto insert_failed; + } + + *err = btr_page_reorganize(page_cursor, mtr); + + if (*err != DB_SUCCESS) { + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary() + && page_is_leaf(page)) { + + ibuf_update_free_bits_for_two_pages_low( + left_block, right_block, mtr); + } + + ut_ad(page_validate(buf_block_get_frame(left_block), + page_cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), + page_cursor->index)); + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/** Remove a page from the level list of pages. 
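+The pages of each level form a doubly linked list chained through
+FIL_PAGE_PREV and FIL_PAGE_NEXT and terminated by FIL_NULL; removing a page
+is the usual unlink, in outline:
+
+  if (prev_page_no != FIL_NULL) btr_page_set_next(prev, next_page_no, mtr);
+  if (next_page_no != FIL_NULL) btr_page_set_prev(next, prev_page_no, mtr);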
+@param[in] block page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ +dberr_t btr_level_list_remove(const buf_block_t& block, + const dict_index_t& index, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block.zip_size() == index.table->space->zip_size()); + ut_ad(index.table->space->id == block.page.id().space()); + /* Get the previous and next page numbers of page */ + const uint32_t prev_page_no= btr_page_get_prev(block.page.frame); + const uint32_t next_page_no= btr_page_get_next(block.page.frame); + page_id_t id{block.page.id()}; + buf_block_t *prev= nullptr, *next; + dberr_t err; + + /* Update page links of the level */ + if (prev_page_no != FIL_NULL) + { + id.set_page_no(prev_page_no); + prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!prev) + { + ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); + prev= btr_block_get(index, id.page_no(), RW_X_LATCH, + page_is_leaf(block.page.frame), mtr, &err); + if (UNIV_UNLIKELY(!prev)) + return err; + } +#endif + } + + if (next_page_no != FIL_NULL) + { + id.set_page_no(next_page_no); + next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!next) + { + ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); + next= btr_block_get(index, id.page_no(), RW_X_LATCH, + page_is_leaf(block.page.frame), mtr, &err); + if (UNIV_UNLIKELY(!next)) + return err; + } +#endif + btr_page_set_prev(next, prev_page_no, mtr); + } + + if (prev) + btr_page_set_next(prev, next_page_no, mtr); + + return DB_SUCCESS; +} + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +buf_block_t* +btr_lift_page_up( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* father_block; + ulint page_level; + page_zip_des_t* father_page_zip; + page_t* page = buf_block_get_frame(block); + ulint root_page_no; + buf_block_t* blocks[BTR_MAX_LEVELS]; + ulint n_blocks; /*!< last used index in blocks[] */ + ulint i; + bool lift_father_up; + buf_block_t* block_orig = block; + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + page_level = btr_page_get_level(page); + root_page_no = dict_index_get_page(index); + + { + btr_cur_t cursor; + rec_offs* offsets = NULL; + mem_heap_t* heap = mem_heap_create( + sizeof(*offsets) + * (REC_OFFS_HEADER_SIZE + 1 + 1 + + unsigned(index->n_fields))); + buf_block_t* b; + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (index->is_spatial()) { + offsets = rtr_page_get_father_block( + nullptr, heap, mtr, nullptr, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, heap, + mtr, &cursor); + } + father_block = btr_cur_get_block(&cursor); + father_page_zip = buf_block_get_page_zip(father_block); + + n_blocks = 0; + + /* Store all ancestor pages so we can reset their + levels later on. 
We have to do all the searches on + the tree now because later on, after we've replaced + the first level, the tree is in an inconsistent state + and can not be searched. */ + for (b = father_block; + b->page.id().page_no() != root_page_no; ) { + ut_a(n_blocks < BTR_MAX_LEVELS); + + if (index->is_spatial()) { + offsets = rtr_page_get_father_block( + nullptr, heap, mtr, nullptr, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, + heap, + mtr, + &cursor); + } + + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); + } + + lift_father_up = (n_blocks && page_level == 0); + if (lift_father_up) { + /* The father page also should be the only on its level (not + root). We should lift up the father page at first. + Because the leaf page should be lifted up only for root page. + The freeing page is based on page_level (==0 or !=0) + to choose segment. If the page_level is changed ==0 from !=0, + later freeing of the page doesn't find the page allocation + to be freed.*/ + + block = father_block; + page = buf_block_get_frame(block); + page_level = btr_page_get_level(page); + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, + MTR_MEMO_PAGE_X_FIX)); + + father_block = blocks[0]; + father_page_zip = buf_block_get_page_zip(father_block); + } + + mem_heap_free(heap); + } + + btr_search_drop_page_hash_index(block, false); + + /* Make the father empty */ + btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(father_block->page.frame)); + + if (index->is_instant() + && father_block->page.id().page_no() == root_page_no) { + ut_ad(!father_page_zip); + + if (page_is_leaf(page)) { + const rec_t* rec = page_rec_get_next( + page_get_infimum_rec(page)); + ut_ad(rec_is_metadata(rec, *index)); + if (rec_is_add_metadata(rec, *index) + && page_get_n_recs(page) == 1) { + index->clear_instant_add(); + goto copied; + } + } + + btr_set_instant(father_block, *index, mtr); + } + + /* Copy the records to the father page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || father_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(father_block, block, + page_get_infimum_rec(page), + index, mtr, err)) { + switch (*err) { + case DB_SUCCESS: + break; + case DB_FAIL: + *err = DB_SUCCESS; + break; + default: + return nullptr; + } + + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(father_page_zip); + ut_a(page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(father_block, + page_zip, page, index, mtr); + + /* Update the lock table and possible hash index. */ + + if (index->has_locking()) { + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); + } + + /* Also update the predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(father_block, block->page.id()); + } else { + btr_search_move_or_delete_hash_entries( + father_block, block); + } + } + +copied: + if (index->has_locking()) { + const page_id_t id{block->page.id()}; + /* Free predicate page locks on the block */ + if (index->is_spatial()) { + lock_sys.prdt_page_free_from_discard(id); + } else { + lock_update_copy_and_discard(*father_block, id); + } + } + + page_level++; + + /* Go upward to root page, decrementing levels by one. */ + for (i = lift_father_up ? 
1 : 0; i < n_blocks; i++, page_level++) { + ut_ad(btr_page_get_level(blocks[i]->page.frame) + == page_level + 1); + btr_page_set_level(blocks[i], page_level, mtr); + } + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* We play it safe and reset the free bits for the father */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(father_block); + } + ut_ad(page_validate(father_block->page.frame, index)); + ut_ad(btr_check_node_ptr(index, father_block, mtr)); + + return(lift_father_up ? block_orig : father_block); +} + +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the brother +reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to the +brothers, if they exist. +@return error code */ +dberr_t +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + buf_block_t* merge_block = nullptr; + page_t* merge_page = nullptr; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + rec_offs* offsets; + ulint nth_rec = 0; /* remove bogus warning */ + bool mbr_changed = false; +#ifdef UNIV_DEBUG + bool leftmost_child; +#endif + DBUG_ENTER("btr_compress"); + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); + + const uint32_t left_page_no = btr_page_get_prev(page); + const uint32_t right_page_no = btr_page_get_next(page); + dberr_t err = DB_SUCCESS; + + ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL + || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page)))); + + heap = mem_heap_create(100); + father_cursor.page_cur.index = index; + father_cursor.page_cur.block = block; + + if (index->is_spatial()) { + offsets = rtr_page_get_father_block( + NULL, heap, mtr, cursor, &father_cursor); + ut_ad(cursor->page_cur.block->page.id() == block->page.id()); + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::info() << "father positioned on page " + << page_no << "instead of " + << block->page.id().page_no(); + offsets = btr_page_get_father_block( + NULL, heap, mtr, &father_cursor); + } + } else { + offsets = btr_page_get_father_block( + NULL, heap, mtr, &father_cursor); + } + + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + 
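+		/* Remember how many records precede the cursor record on
+		this page, so that after a successful merge or lift the
+		cursor can be re-positioned on the same record with
+		page_rec_get_nth(); when merging to the left sibling, the
+		position is adjusted below for the records that were
+		already on that page. */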
if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) { + corrupted: + err = DB_CORRUPTION; + err_exit: + /* We play it safe and reset the free bits. */ + if (merge_block && merge_block->zip_size() + && page_is_leaf(merge_block->page.frame) + && !index->is_clust()) { + ibuf_reset_free_bits(merge_block); + } + goto func_exit; + } + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr, &err); +success: + if (adjust) { + ut_ad(nth_rec > 0); + if (rec_t* nth + = page_rec_get_nth(merge_block->page.frame, + nth_rec)) { + btr_cur_position(index, nth, + merge_block, cursor); + } else { + goto corrupted; + } + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); +func_exit: + mem_heap_free(heap); + DBUG_RETURN(err); + } + + ut_d(leftmost_child = + left_page_no != FIL_NULL + && (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page(&father_cursor))) + == btr_cur_get_rec(&father_cursor))); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); +retry: + if (!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + if (!merge_block) { + merge_page = NULL; + } +cannot_merge: + err = DB_FAIL; + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left + ? FIL_PAGE_NEXT + : FIL_PAGE_PREV), + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + goto corrupted; + } + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + btr_cur_t cursor2; + cursor2.page_cur.index = index; + cursor2.page_cur.block = merge_block; + + /* Move records to the merge page */ + if (is_left) { + rtr_mbr_t new_mbr; + rec_offs* offsets2 = NULL; + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + /* We only support merge pages with the same parent + page */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), heap)) { + is_left = false; + goto retry; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? 
index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + /* Check if parent entry needs to be updated */ + mbr_changed = rtr_merge_mbr_changed( + &cursor2, &father_cursor, + offsets2, offsets, &new_mbr); + } + + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr, &err); + + if (!orig_pred) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block, false); + + /* Remove the page from the level list */ + err = btr_level_list_remove(*block, *index, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + const page_id_t id{block->page.id()}; + + if (index->is_spatial()) { + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no( + my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::fatal() << "father positioned on " + << page_no << " instead of " + << block->page.id().page_no(); + } + + if (mbr_changed) { + rtr_update_mbr_field( + &cursor2, offsets2, &father_cursor, + merge_page, &new_mbr, NULL, mtr); + } else { + rtr_node_ptr_delete(&father_cursor, mtr); + } + + /* No GAP lock needs to be worrying about */ + lock_sys.prdt_page_free_from_discard(id); + } else { + err = btr_cur_node_ptr_delete(&father_cursor, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + if (index->has_locking()) { + lock_update_merge_left( + *merge_block, orig_pred, id); + } + } + + if (adjust) { + ulint n = page_rec_get_n_recs_before(orig_pred); + if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) { + goto corrupted; + } + nth_rec += n; + } + } else { + rec_t* orig_succ; + ibool compressed; + dberr_t err; + byte fil_page_prev[4]; + + if (index->is_spatial()) { + /* For spatial index, we disallow merge of blocks + with different parents, since the merge would need + to update entry (for MBR and Primary key) in the + parent of block being merged */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), heap)) { + goto cannot_merge; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + } else if (!btr_page_get_father(mtr, &cursor2)) { + goto cannot_merge; + } + + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); + compile_time_assert(FIL_NULL == 0xffffffffU); + memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index(), mtr, &err); + + if (!orig_succ) { + ut_a(merge_page_zip); + if (left_page_no == FIL_NULL) { + /* FIL_PAGE_PREV was restored from + merge_page_zip. */ + ut_ad(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); + } + goto err_exit; + } + + btr_search_drop_page_hash_index(block, false); + + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. 
*/ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } + + /* Remove the page from the level list */ + err = btr_level_list_remove(*block, *index, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&father_cursor), + btr_cur_get_rec(&father_cursor), + offsets, right_page_no, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec( + buf_block_get_frame(merge_block))), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + rec_offs* offsets2; + ulint rec_info; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&cursor2), offsets2) + == right_page_no); + + rec_info = rec_get_info_bits( + btr_cur_get_rec(&father_cursor), + rec_offs_comp(offsets)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the father node ptr is minimal rec, + we will keep it and delete the node ptr of + merge page. */ + rtr_merge_and_update_mbr(&father_cursor, + &cursor2, + offsets, offsets2, + merge_page, mtr); + } else { + /* Otherwise, we will keep the node ptr of + merge page and delete the father node ptr. + This is for keeping the rec order in upper + level. */ + rtr_merge_and_update_mbr(&cursor2, + &father_cursor, + offsets2, offsets, + merge_page, mtr); + } + const page_id_t id{block->page.id()}; + lock_sys.prdt_page_free_from_discard(id); + } else { + + compressed = btr_cur_pessimistic_delete(&err, TRUE, + &cursor2, + BTR_CREATE_FLAG, + false, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, false, + mtr); + } + + if (index->has_locking()) { + lock_update_merge_right( + merge_block, orig_succ, block); + } + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. 
*/ + + if (merge_block->zip_size()) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + if (err == DB_SUCCESS) { + ut_ad(leftmost_child + || btr_check_node_ptr(index, merge_block, mtr)); + goto success; + } else { + goto err_exit; + } +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should almost never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +ATTRIBUTE_COLD +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + + ut_ad(!index->is_dummy); + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame); + const rec_t* r = page_rec_get_next( + page_get_infimum_rec(block->page.frame)); + /* In the caller we checked that a valid key exists in the page, + because we were able to look up a parent page. */ + ut_ad(r); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + + while (block->page.id().page_no() != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page)); + ut_a(!page_has_siblings(page)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(block->page.id().space() == index->table->space->id); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block, false); + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (index->is_spatial()) { + /* Check any concurrent search having this page */ + rtr_check_discard_page(index, NULL, block); + if (!rtr_page_get_father(mtr, nullptr, &cursor)) { + return; + } + } else { + if (!btr_page_get_father(mtr, &cursor)) { + return; + } + } + father = btr_cur_get_block(&cursor); + + if (index->has_locking()) { + lock_update_discard( + father, PAGE_HEAP_NO_SUPREMUM, block); + } + + /* Free the file page */ + if (btr_page_free(index, block, mtr) != DB_SUCCESS) { + return; + } + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). 
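+	If the root holds an instant ALTER TABLE metadata record, it may be
+	copied aside below and re-inserted after the page has been emptied.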
*/ + ut_ad(!page_has_siblings(block->page.frame)); + + mem_heap_t* heap = nullptr; + const rec_t* rec = nullptr; + rec_offs* offsets = nullptr; + if (index->table->instant || index->must_avoid_clear_instant_add()) { + if (!rec_is_metadata(r, *index)) { + } else if (!index->table->instant + || rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(block->page.frame)); + + if (index->is_primary()) { + if (rec) { + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index = index; + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } + } else if (!index->table->is_temporary()) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +dberr_t +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* merge_block; + buf_block_t* block; + btr_cur_t parent_cursor; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + parent_cursor.page_cur = cursor->page_cur; + + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + if (index->is_spatial() + ? !rtr_page_get_father(mtr, cursor, &parent_cursor) + : !btr_page_get_father(mtr, &parent_cursor)) { + return DB_CORRUPTION; + } + + /* Decide the page which will inherit the locks */ + + const uint32_t left_page_no = btr_page_get_prev(block->page.frame); + const uint32_t right_page_no = btr_page_get_next(block->page.frame); + page_id_t merge_page_id{block->page.id()}; + + ut_d(bool parent_is_different = false); + dberr_t err; + if (left_page_no != FIL_NULL) { + merge_page_id.set_page_no(left_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. 
*/ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = + (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page( + &parent_cursor))) + == btr_cur_get_rec(&parent_cursor))); + } else if (right_page_no != FIL_NULL) { + merge_page_id.set_page_no(right_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); + if (page_is_leaf(merge_block->page.frame)) { + } else if (rec_t* node_ptr = + page_rec_get_next(page_get_infimum_rec( + merge_block->page.frame))) { + ut_ad(page_rec_is_user_rec(node_ptr)); + /* We have to mark the leftmost node pointer as the + predefined minimum record. */ + btr_set_min_rec_mark<true>(node_ptr, *merge_block, + mtr); + } else { + return DB_CORRUPTION; + } + } else { + btr_discard_only_page_on_level(index, block, mtr); + return DB_SUCCESS; + } + + if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame + [PAGE_HEADER + PAGE_LEVEL], + &block->page.frame + [PAGE_HEADER + PAGE_LEVEL], 2))) { + return DB_CORRUPTION; + } + + btr_search_drop_page_hash_index(block, false); + + if (dict_index_is_spatial(index)) { + rtr_node_ptr_delete(&parent_cursor, mtr); + } else if (dberr_t err = + btr_cur_node_ptr_delete(&parent_cursor, mtr)) { + return err; + } + + /* Remove the page from the level list */ + if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) { + return err; + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block)) + ut_a(page_zip_validate(merge_page_zip, + merge_block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (index->has_locking()) { + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + if (index->is_spatial()) { + rtr_check_discard_page(index, cursor, block); + } + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + + if (err == DB_SUCCESS) { + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(parent_is_different + || btr_check_node_ptr(index, merge_block, mtr)); + + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page + && !page_has_siblings(btr_cur_get_page(&parent_cursor)) + && page_get_n_recs(btr_cur_get_page(&parent_cursor)) + == 1) { + btr_lift_page_up(index, merge_block, mtr, &err); + } + } + + return err; +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. 
*/ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_ibuf(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); + + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) + << " page " << block->page.id; + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets( + node_ptr, index, *offsets, 0, + ULINT_UNDEFINED, heap); + if (buf_block_t *child = + btr_node_ptr_get_child(node_ptr, index, *offsets, + &mtr2)) { + btr_print_recursive(index, child, width, heap, + offsets, &mtr2); + } + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_SX_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + ut_ad(btr_validate_index(index, 0)); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
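+That is, the node pointer record in the father page must carry the same key
+as the first user record on the page; for a spatial index, the first record's
+MBR must lie within the MBR stored in the father record.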
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + rec_offs* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + if (dict_index_get_page(index) == block->page.id().page_no()) { + + return(TRUE); + } + + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + heap = mem_heap_create(256); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block(NULL, heap, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor); + } + + ut_ad(offsets); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page)); + + /* For spatial index, the MBR in the parent rec could be different + with that of first rec of child, their relationship should be + "WITHIN" relationship */ + if (dict_index_is_spatial(index)) { + ut_a(!cmp_dtuple_rec_with_gis( + tuple, btr_cur_get_rec(&cursor), + PAGE_CUR_WITHIN)); + } else { + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), index, + offsets)); + } +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + ib::info() << "Record in index " << index->name + << " of table " << index->table->name + << ", page " << page_id_t(page_get_space_id(page), + page_get_page_no(page)) + << ", at offset " << page_offset(rec); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. 
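+Besides the field count, the stored lengths of fixed-size, prefix-indexed
+and externally stored columns are checked against the index definition.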
+@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + const page_t* page; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + fprintf(stderr, "index name is %s\n", index->name()); + } +#endif + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Compact flag=" << !!page_is_comp(page) + << ", should be " << dict_table_is_comp(index->table); + + return(FALSE); + } + + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + && rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { +n_field_mismatch: + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; + + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + } + + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); + + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { + rec_get_nth_field_offs(offsets, i, &len); + + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + if (rec_offs_nth_extern(offsets, i)) { + const byte* data = rec_get_nth_field( + rec, offsets, i, &len); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ulint extern_len = mach_read_from_4( + data + len + BTR_EXTERN_LEN + 4); + if (fixed_size == extern_len + len) { + goto next_field; + } + } + } + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + We should adjust it here. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { +len_mismatch: + btr_index_rec_validate_report(page, rec, index); + ib::error error; + + error << "Field " << i << " len is " << len + << ", should be " << fixed_size; + + if (dump_on_error) { + error << "; "; + rec_print(error.m_oss, rec, + rec_get_info_bits( + rec, rec_offs_comp(offsets)), + offsets); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } +next_field: + field++; + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + rec_print_new(stderr, rec, offsets); + } +#endif + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return true if ok */ +static +bool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + while (page_cur_move_to_next(&cur)) { + if (page_cur_is_after_last(&cur)) { + return true; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + break; + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + } + + return false; +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + ib::error error; + error << "In page " << block->page.id().page_no() + << " of index " << index->name + << " of table " << index->table->name; + + if (level > 0) { + error << ", index tree level " << level; + } +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; + + if (level) + error << ", index tree level " << level; +} + +/** Validate an index tree level. 
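+@return DB_SUCCESS if ok, error code if not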
*/ +static +dberr_t +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level) /*!< in: level number */ +{ + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + rec_offs* offsets = NULL; + rec_offs* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!block) { + mtr.commit(); + return err; + } + page = buf_block_get_frame(block); + + fil_space_t* space = index->table->space; + + while (level != btr_page_get_level(page)) { + const rec_t* node_ptr; + switch (dberr_t e = + fseg_page_is_allocated(space, + block->page.id().page_no())) { + case DB_SUCCESS_LOCKED_REC: + break; + case DB_SUCCESS: + btr_validate_report1(index, level, block); + ib::warn() << "Page is free"; + e = DB_CORRUPTION; + /* fall through */ + default: + err = e; + } + ut_ad(index->table->space_id == block->page.id().space()); + ut_ad(block->page.id().space() == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + if (page_is_leaf(page)) { +corrupted: + err = DB_CORRUPTION; + goto invalid_page; + } + + page_cur_set_before_first(block, &cursor); + if (!(node_ptr = page_cur_move_to_next(&cursor))) { + goto corrupted; + } + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr, + &err); + if (!block) { + break; + } + page = buf_block_get_frame(block); + + /* For R-Tree, since record order might not be the same as + linked index page in the lower level, we need to travers + backwards to get the first page rec in this level. + This is only used for index validation. Spatial index + does not use such scan for any of its DML or query + operations */ + if (dict_index_is_spatial(index)) { + uint32_t left_page_no = btr_page_get_prev(page); + + while (left_page_no != FIL_NULL) { + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr.release_last_page(); + + block = btr_block_get(*index, left_page_no, + RW_SX_LATCH, false, + &mtr, &err); + if (!block) { + goto invalid_page; + } + page = buf_block_get_frame(block); + left_page_no = btr_page_get_prev(page); + } + } + } + + /* Now we are on the desired level. Loop through the pages on that + level. 
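+	Each iteration validates one page and then follows FIL_PAGE_NEXT to the
+	right sibling, until the end of the level is reached or the transaction
+	is interrupted.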
*/ + +loop: + if (!block) { +invalid_page: + mtr.commit(); +func_exit: + mem_heap_free(heap); + return err; + } + + mem_heap_empty(heap); + offsets = offsets2 = NULL; + + mtr_x_lock_index(index, &mtr); + + page = block->page.frame; + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (DB_SUCCESS_LOCKED_REC + != fseg_page_is_allocated(space, block->page.id().page_no())) { + btr_validate_report1(index, level, block); + + ib::warn() << "Page is marked as free"; + err = DB_CORRUPTION; + } else if (btr_page_get_index_id(page) != index->id) { + ib::error() << "Page index id " << btr_page_get_index_id(page) + << " != data dictionary index id " << index->id; + err = DB_CORRUPTION; + } else if (!page_validate(page, index)) { + btr_validate_report1(index, level, block); + err = DB_CORRUPTION; + } else if (btr_page_get_level(page) != level) { + btr_validate_report1(index, level, block); + ib::error() << "Page level is not " << level; + err = DB_CORRUPTION; + } else if (level == 0 && !btr_index_page_validate(block, index)) { + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. */ + err = DB_CORRUPTION; + } else if (!page_is_empty(page)) { + } else if (level) { + btr_validate_report1(index, level, block); + ib::error() << "Non-leaf page is empty"; + } else if (block->page.id().page_no() != index->page) { + btr_validate_report1(index, level, block); + ib::error() << "Empty leaf page is not index root"; + } + + uint32_t right_page_no = btr_page_get_next(page); + uint32_t left_page_no = btr_page_get_prev(page); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + + right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + if (!right_block) { + btr_validate_report1(index, level, block); + fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr); + goto invalid_page; + } + right_page = buf_block_get_frame(right_block); + + if (btr_page_get_prev(right_page) != page_get_page_no(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + err = DB_CORRUPTION; + } + + if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) { +broken_links: + btr_validate_report1(index, level, block); + fputs("InnoDB: broken record links\n", stderr); + goto invalid_page; + } + if (!(right_rec = + page_rec_get_next(page_get_infimum_rec(right_page)))) { + goto broken_links; + } + + offsets = rec_get_offsets(rec, index, offsets, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, offsets2, + page_is_leaf(right_page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + /* For spatial index, we cannot guarantee the key ordering + across pages, so skip the record compare verification for + now. 
Will enhanced in special R-Tree index validation scheme */ + if (index->is_btree() + && cmp_rec_rec(rec, right_rec, + offsets, offsets2, index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + fputs("InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + } + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + if (rec) { + rec_print(stderr, rec, index); + } + putc('\n', stderr); + err = DB_CORRUPTION; + } + } + + if (!level || left_page_no != FIL_NULL) { + } else if (const rec_t* first = + page_rec_get_next_const(page_get_infimum_rec(page))) { + if (!(REC_INFO_MIN_REC_FLAG + & rec_get_info_bits(first, page_is_comp(page)))) { + btr_validate_report1(index, level, block); + ib::error() << "Missing REC_INFO_MIN_REC_FLAG"; + err = DB_CORRUPTION; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + /* Similarly skip the father node check for spatial index for now, + for a couple of reasons: + 1) As mentioned, there is no ordering relationship between records + in parent level and linked pages in the child level. + 2) Search parent from root is very costly for R-tree. + We will add special validation mechanism for R-tree later (WL #7520) */ + if (index->is_btree() && block->page.id().page_no() != index->page) { + /* Check father node pointers */ + rec_t* node_ptr + = page_rec_get_next(page_get_infimum_rec(page)); + if (!node_ptr) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + btr_cur_position(index, node_ptr, block, &node_cur); + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + btr_cur_position(index, rec, block, &node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + } else { + offsets = nullptr; + } + + if (!offsets || node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != block->page.id().page_no()) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + if (offsets) { + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %u\n", + btr_node_ptr_get_child_page_no( + rec, offsets)); + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (page_is_leaf(page)) { + } else if (const rec_t* first_rec = + page_rec_get_next(page_get_infimum_rec(page))) { + node_ptr_tuple = dict_index_build_node_ptr( + index, first_rec, + 0, heap, btr_page_get_level(page)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, index, + offsets)) { + btr_validate_report1(index, level, block); + + ib::error() << "Node ptrs differ on levels > 0"; + + fputs("InnoDB: node ptr ",stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (left_page_no == FIL_NULL) { + if 
(page_has_prev(father_page) + || node_ptr != page_rec_get_next( + page_get_infimum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } + + if (right_page_no == FIL_NULL) { + if (page_has_next(father_page) + || node_ptr != page_rec_get_prev( + page_get_supremum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else if (const rec_t* right_node_ptr + = page_rec_get_next(node_ptr)) { + btr_cur_position( + index, + page_get_infimum_rec(right_block->page.frame), + right_block, &right_node_cur); + if (!page_cur_move_to_next(&right_node_cur.page_cur)) { + goto node_pointer_corrupted; + } + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &right_node_cur, &mtr); + + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { +node_pointer_corrupted: + err = DB_CORRUPTION; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page)) { + + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } + } else { + err = DB_CORRUPTION; + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr.commit(); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr.start(); + + block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + goto loop; + } + + goto func_exit; +} + +/**************************************************************//** +Checks the consistency of an index tree. +@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + mtr_t mtr; + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + if (page_t *root= btr_root_get(index, &mtr, &err)) + for (auto level= btr_page_get_level(root);; level--) + { + if (dberr_t err_level= btr_validate_level(index, trx, level)) + err= err_level; + if (!level) + break; + } + + mtr.commit(); + return err; +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. 
*/
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
+	uint32_t	page_no,	/*!< in: a sibling page */
+	buf_block_t**	merge_block,	/*!< out: the merge block */
+	mtr_t*		mtr)		/*!< in: mini-transaction */
+{
+	dict_index_t*	index;
+	page_t*		page;
+	ulint		n_recs;
+	ulint		data_size;
+	ulint		max_ins_size_reorg;
+	ulint		max_ins_size;
+	buf_block_t*	mblock;
+	page_t*		mpage;
+	DBUG_ENTER("btr_can_merge_with_page");
+
+	if (page_no == FIL_NULL) {
+error:
+		*merge_block = NULL;
+		DBUG_RETURN(false);
+	}
+
+	index = btr_cur_get_index(cursor);
+	page = btr_cur_get_page(cursor);
+
+	mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
+			       mtr);
+	if (!mblock) {
+		goto error;
+	}
+	mpage = buf_block_get_frame(mblock);
+
+	n_recs = page_get_n_recs(page);
+	data_size = page_get_data_size(page);
+
+	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
+		mpage, n_recs);
+
+	if (data_size > max_ins_size_reorg) {
+		goto error;
+	}
+
+	/* If the compression padding tells us that the merge would produce
+	a page so tightly packed that compression is likely to fail,
+	do not merge the pages. */
+	if (mblock->page.zip.data && page_is_leaf(mpage)
+	    && (page_get_data_size(mpage) + data_size
+		>= dict_index_zip_pad_optimal_page_size(index))) {
+
+		goto error;
+	}
+
+	max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+	if (data_size > max_ins_size) {
+		/* We have to reorganize mpage */
+		if (btr_page_reorganize_block(page_zip_level, mblock, index,
+					      mtr) != DB_SUCCESS) {
+			goto error;
+		}
+
+		max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+		ut_ad(page_validate(mpage, index));
+		ut_ad(max_ins_size == max_ins_size_reorg);
+
+		if (data_size > max_ins_size) {
+
+			/* Added for fault tolerance, though this should
+			never happen */
+
+			goto error;
+		}
+	}
+
+	*merge_block = mblock;
+	DBUG_RETURN(true);
+}