/*****************************************************************************

Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2014, 2023, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0btr.cc
The B-tree

Created 6/2/1994 Heikki Tuuri
*******************************************************/

#include "btr0btr.h"

#include "page0page.h"
#include "page0zip.h"
#include "gis0rtree.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#include "trx0trx.h"
#include "srv0mon.h"
#include "gis0geo.h"
#include "dict0boot.h"
#include "row0sel.h" /* row_search_max_autoinc() */
#include "log.h"

/**************************************************************//**
Checks if the page in the cursor can be merged with given page.
If necessary, re-organize the merge_page.
@return true if possible to merge. */
static
bool
btr_can_merge_with_page(
/*====================*/
	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
	uint32_t	page_no,	/*!< in: a sibling page */
	buf_block_t**	merge_block,	/*!< out: the merge block */
	mtr_t*		mtr);		/*!< in: mini-transaction */

/*
Latching strategy of the InnoDB B-tree
--------------------------------------

Node pointer page latch acquisition is protected by the index->lock latch.

Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
either in S (shared) or X (exclusive) mode and block->lock was not acquired
on node pointer pages.

After MariaDB 10.2.2, a block->lock S-latch or X-latch is used to protect
node pointer pages, and the acquisition of node pointer page latches is
protected by index->lock.

(0) Definition: B-tree level.

(0.1) The leaf pages of the B-tree are at level 0.

(0.2) The parent of a page at level L has level L+1. (The level of the
root page is equal to the tree height.)

(0.3) The B-tree lock (index->lock) is the parent of the root page and
has a level = tree height + 1.

Index->lock has 3 possible locking modes:

(1) S-latch:

(1.1) All latches for pages must be obtained in descending order of tree
level.

(1.2) Before obtaining the first node pointer page latch at a given B-tree
level, the parent latch must be held (at level + 1).

(1.3) If a node pointer page is already latched at the same level, we can
only obtain a latch on its right sibling page at the same level.

(1.4) Release of the node pointer page latches must be done in
child-to-parent order. (This prevents deadlocks when index->lock has been
obtained in SX mode.)

(1.4.1) A level L node pointer page latch can be released only when no
latches at the children level, i.e. level < L, are held.

(1.4.2) All node pointer page latches must be released so that no new
latches are obtained in between.
(1.5) [implied by (1.1), (1.2)] The root page latch must be the first node
pointer latch obtained.

(2) SX-latch:

In this case, rules (1.2) and (1.3) from the S-latch case are relaxed and
merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition can be
skipped at some tree levels and latches can be obtained in a less restricted
order.

(2.1) [identical to (1.1)]: All latches for pages must be obtained in
descending order of tree level.

(2.2) When a node pointer latch at level L is obtained, the left sibling
page latch at the same level or some ancestor page latch (at level > L)
must be held.

(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained
can be any node pointer page.

(3) X-latch:

Node pointer latches can be obtained in any order.

NOTE: The new rules after MariaDB 10.2.2 do not affect the latching rules
of leaf pages:

An index->lock S-latch is needed during reads for the node pointer
traversal. When the leaf level is reached, index->lock can be released (and
with the MariaDB 10.2.2 changes, all node pointer latches). Left-to-right
index traversal at the leaf page level can be done safely by obtaining the
right sibling leaf page latch and then releasing the old page latch.

Single leaf page modifications (BTR_MODIFY_LEAF) are protected by
index->lock S-latch.

B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
allocations are protected by index->lock X-latch.

Node pointers
-------------
Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. To the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.

File page allocation
--------------------
In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/

/** Check a file segment header within a B-tree root page.
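As a rough sketch of the on-page layout being checked (an illustration using
the names from the code below, not a normative description): the root page of
a persistent index carries two file segment headers,

  FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF  for the leaf (level 0) page segment,
  FIL_PAGE_DATA + PAGE_BTR_SEG_TOP   for the non-leaf page segment.

This function checks one such header: its FSEG_HDR_OFFSET must lie within
[FIL_PAGE_DATA, srv_page_size - FIL_PAGE_DATA_END] and its FSEG_HDR_SPACE
must match the tablespace id.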
@param offset	file segment header offset
@param block	B-tree root page
@param space	tablespace
@return whether the segment header is valid */
static bool btr_root_fseg_validate(ulint offset, const buf_block_t &block,
                                   const fil_space_t &space)
{
  ut_ad(block.page.id().space() == space.id);
  const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
                                       block.page.frame);
  if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END &&
      mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) ==
      space.id)
    return true;
  sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted "
                  "at " ULINTPF,
                  block.page.id().page_no(),
                  UT_LIST_GET_FIRST(space.chain)->name);
  return false;
}

/** Report a decryption failure. */
ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
{
  ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED,
                  "Table %s is encrypted but encryption service or"
                  " used key_id is not available. "
                  " Can't continue reading table.",
                  index.table->name.m_name);
  index.table->file_unreadable= true;
}

/** Get an index page and declare its latching order level.
@param[in]	index	index tree
@param[in]	page	page number
@param[in]	mode	latch mode
@param[in]	merge	whether change buffer merge should be attempted
@param[in,out]	mtr	mini-transaction
@param[out]	err	error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
                           uint32_t page, rw_lock_type_t mode, bool merge,
                           mtr_t *mtr, dberr_t *err)
{
  ut_ad(mode != RW_NO_LATCH);
  dberr_t local_err;
  if (!err)
    err= &local_err;
  buf_block_t *block=
    buf_page_get_gen(page_id_t{index.table->space->id, page},
                     index.table->space->zip_size(), mode, nullptr, BUF_GET,
                     mtr, err, merge && !index.is_clust());
  ut_ad(!block == (*err != DB_SUCCESS));

  if (UNIV_LIKELY(block != nullptr))
  {
    if (!!page_is_comp(block->page.frame) != index.table->not_redundant() ||
        btr_page_get_index_id(block->page.frame) != index.id ||
        !fil_page_index_page_check(block->page.frame) ||
        index.is_spatial() !=
        (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
    {
      *err= DB_PAGE_CORRUPTED;
      block= nullptr;
    }
  }
  else if (*err == DB_DECRYPTION_FAILED)
    btr_decryption_failed(index);

  return block;
}

/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
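A minimal usage sketch (assuming the caller holds index->lock and an active
mini-transaction; error handling abbreviated):

  dberr_t err;
  buf_block_t *root= btr_root_block_get(index, RW_S_LATCH, mtr, &err);
  if (!root)
    ... handle err, e.g. DB_PAGE_CORRUPTED or DB_DECRYPTION_FAILED ...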
@return root page, x- or s-latched */ buf_block_t* btr_root_block_get( /*===============*/ dict_index_t* index, /*!< in: index tree */ rw_lock_type_t mode, /*!< in: either RW_S_LATCH or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { if (!index->table || !index->table->space) { *err= DB_TABLESPACE_NOT_FOUND; return nullptr; } buf_block_t *block; #ifndef BTR_CUR_ADAPT static constexpr buf_block_t *guess= nullptr; #else buf_block_t *&guess= btr_search_get_info(index)->root_guess; guess= #endif block= buf_page_get_gen(page_id_t{index->table->space->id, index->page}, index->table->space->zip_size(), mode, guess, BUF_GET, mtr, err, false); ut_ad(!block == (*err != DB_SUCCESS)); if (UNIV_LIKELY(block != nullptr)) { if (UNIV_UNLIKELY(mode == RW_NO_LATCH)); else if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || btr_page_get_index_id(block->page.frame) != index->id || !fil_page_index_page_check(block->page.frame) || index->is_spatial() != (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) { *err= DB_PAGE_CORRUPTED; block= nullptr; } else if (index->is_ibuf()); else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, *block, *index->table->space) || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, *block, *index->table->space)) { *err= DB_CORRUPTION; block= nullptr; } } else if (*err == DB_DECRYPTION_FAILED) btr_decryption_failed(*index); return block; } /**************************************************************//** Gets the root node of a tree and sx-latches it for segment access. @return root page, sx-latched */ static page_t* btr_root_get( /*=========*/ dict_index_t* index, /*!< in: index tree */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { /* Intended to be used for accessing file segment lists. Concurrent read of other data is allowed. */ if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)) return root->page.frame; return nullptr; } /**************************************************************//** Checks a file segment header within a B-tree root page and updates the segment header space id. @return TRUE if valid */ static bool btr_root_fseg_adjust_on_import( /*===========================*/ fseg_header_t* seg_header, /*!< in/out: segment header */ page_zip_des_t* page_zip, /*!< in/out: compressed page, or NULL */ ulint space) /*!< in: tablespace identifier */ { ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); if (offset < FIL_PAGE_DATA || offset > srv_page_size - FIL_PAGE_DATA_END) { return false; } seg_header += FSEG_HDR_SPACE; mach_write_to_4(seg_header, space); if (UNIV_LIKELY_NULL(page_zip)) { memcpy(page_zip->data + page_offset(seg_header), seg_header, 4); } return true; } /**************************************************************//** Checks and adjusts the root node of a tree during IMPORT TABLESPACE. 
@return error code, or DB_SUCCESS */ dberr_t btr_root_adjust_on_import( /*======================*/ const dict_index_t* index) /*!< in: index tree */ { dberr_t err; mtr_t mtr; page_t* page; page_zip_des_t* page_zip; dict_table_t* table = index->table; DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", return(DB_CORRUPTION);); mtr_start(&mtr); mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); buf_block_t* block = buf_page_get_gen( page_id_t(table->space->id, index->page), table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, &mtr, &err); if (!block) { ut_ad(err != DB_SUCCESS); goto func_exit; } page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); if (!fil_page_index_page_check(page) || page_has_siblings(page)) { err = DB_CORRUPTION; } else if (dict_index_is_clust(index)) { bool page_is_compact_format; page_is_compact_format = page_is_comp(page) > 0; /* Check if the page format and table format agree. */ if (page_is_compact_format != dict_table_is_comp(table)) { err = DB_CORRUPTION; } else { /* Check that the table flags and the tablespace flags match. */ uint32_t tf = dict_tf_to_fsp_flags(table->flags); uint32_t sf = table->space->flags; sf &= ~FSP_FLAGS_MEM_MASK; tf &= ~FSP_FLAGS_MEM_MASK; if (fil_space_t::is_flags_equal(tf, sf) || fil_space_t::is_flags_equal(sf, tf)) { mysql_mutex_lock(&fil_system.mutex); table->space->flags = (table->space->flags & ~FSP_FLAGS_MEM_MASK) | (tf & FSP_FLAGS_MEM_MASK); mysql_mutex_unlock(&fil_system.mutex); err = DB_SUCCESS; } else { err = DB_CORRUPTION; } } } else { err = DB_SUCCESS; } /* Check and adjust the file segment headers, if all OK so far. */ if (err == DB_SUCCESS && (!btr_root_fseg_adjust_on_import( FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + page, page_zip, table->space_id) || !btr_root_fseg_adjust_on_import( FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + page, page_zip, table->space_id))) { err = DB_CORRUPTION; } func_exit: mtr_commit(&mtr); return(err); } /**************************************************************//** Creates a new index page (not the root, and also not used in page reorganization). @see btr_page_empty(). 
*/
void
btr_page_create(
/*============*/
	buf_block_t*	block,	/*!< in/out: page to be created */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */
{
  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
  byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
                                       block->page.frame);

  if (UNIV_LIKELY_NULL(page_zip))
  {
    mach_write_to_8(index_id, index->id);
    page_create_zip(block, index, level, 0, mtr);
  }
  else
  {
    page_create(block, mtr, dict_table_is_comp(index->table));
    if (index->is_spatial())
    {
      static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
                    FIL_PAGE_RTREE, "compatibility");
      mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
                    byte(FIL_PAGE_RTREE));
      if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
        mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
    }
    /* Set the level of the new index page */
    mtr->write<2,mtr_t::MAYBE_NOP>(*block,
                                   my_assume_aligned<2>(PAGE_HEADER +
                                                        PAGE_LEVEL +
                                                        block->page.frame),
                                   level);
    mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
  }
}

buf_block_t *
mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const
{
  ut_ad(is_active());
  ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX ||
        type == MTR_MEMO_PAGE_S_FIX);
  for (ulint i= 0; i < m_memo.size(); i++)
  {
    const mtr_memo_slot_t &slot= m_memo[i];
    const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
    if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type)
    {
      buf_block_t *block= static_cast<buf_block_t*>(slot.object);
      if (block->page.id() == id)
        return block;
    }
  }
  return nullptr;
}

/** Fetch an index root page that was already latched in the
mini-transaction. */
static buf_block_t *btr_get_latched_root(const dict_index_t &index,
                                         mtr_t *mtr)
{
  return mtr->get_already_latched(page_id_t{index.table->space_id,
                                            index.page},
                                  MTR_MEMO_PAGE_SX_FIX);
}

/** Fetch an index page that should have been already latched in the
mini-transaction. */
static buf_block_t *
btr_block_reget(mtr_t *mtr, const dict_index_t &index,
                const page_id_t id, dberr_t *err)
{
  if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX))
  {
    *err= DB_SUCCESS;
    return block;
  }

  ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
  return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
}

/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@return new allocated block, x-latched */
static
buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr,	/*!< in: mtr */
	dberr_t*	err)	/*!< out: error code */
{
  buf_block_t *root= btr_get_latched_root(*index, mtr);
  if (UNIV_UNLIKELY(!root))
    return root;

  buf_block_t *new_block=
    buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
                               mach_read_from_4(PAGE_HEADER +
                                                PAGE_BTR_IBUF_FREE_LIST +
                                                FLST_FIRST + FIL_ADDR_PAGE +
                                                root->page.frame)),
                     0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
  if (new_block)
    *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
                      PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
  ut_d(if (*err == DB_SUCCESS)
         flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
  return new_block;
}

/**************************************************************//**
Allocates a new file page to be used in an index tree.
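The page is taken from the file segment that matches the level of the new
page (an illustrative restatement of the segment selection in the code
below):

  level == 0:  PAGE_HEADER + PAGE_BTR_SEG_LEAF in the root (leaf segment)
  level  > 0:  PAGE_HEADER + PAGE_BTR_SEG_TOP  in the root (non-leaf segment)
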
NOTE: we assume that the caller has made the reservation for free extents! @retval NULL if no page could be allocated */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) buf_block_t* btr_page_alloc_low( /*===============*/ dict_index_t* index, /*!< in: index */ uint32_t hint_page_no, /*!< in: hint of a good page */ byte file_direction, /*!< in: direction where a possible page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ mtr_t* mtr, /*!< in/out: mini-transaction for the allocation */ mtr_t* init_mtr, /*!< in/out: mtr or another mini-transaction in which the page should be initialized. */ dberr_t* err) /*!< out: error code */ { const auto savepoint= mtr->get_savepoint(); buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err); if (UNIV_UNLIKELY(!root)) return root; const bool have_latch= mtr->have_u_or_x_latch(*root); #ifdef BTR_CUR_HASH_ADAPT ut_ad(!have_latch || !root->index || !root->index->freed()); #endif mtr->rollback_to_savepoint(savepoint); if (!have_latch && UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)))) return root; fseg_header_t *seg_header= root->page.frame + (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF); return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, true, mtr, init_mtr, err); } /**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! @retval NULL if no page could be allocated */ buf_block_t* btr_page_alloc( /*===========*/ dict_index_t* index, /*!< in: index */ uint32_t hint_page_no, /*!< in: hint of a good page */ byte file_direction, /*!< in: direction where a possible page split is made */ ulint level, /*!< in: level where the page is placed in the tree */ mtr_t* mtr, /*!< in/out: mini-transaction for the allocation */ mtr_t* init_mtr, /*!< in/out: mini-transaction for x-latching and initializing the page */ dberr_t* err) /*!< out: error code */ { ut_ad(level < BTR_MAX_NODE_LEVEL); return index->is_ibuf() ? btr_page_alloc_for_ibuf(index, mtr, err) : btr_page_alloc_low(index, hint_page_no, file_direction, level, mtr, init_mtr, err); } /**************************************************************//** Frees a page used in an ibuf tree. Puts the page to the free list of the ibuf tree. */ static dberr_t btr_page_free_for_ibuf( /*===================*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: block to be freed, x-latched */ mtr_t* mtr) /*!< in: mtr */ { ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); buf_block_t *root= btr_get_latched_root(*index, mtr); dberr_t err= flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); ut_d(if (err == DB_SUCCESS) flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); return err; } /** Free an index page. 
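The page is returned to the file segment it was allocated from: BLOB pages
and leaf pages go back to PAGE_BTR_SEG_LEAF, other pages to PAGE_BTR_SEG_TOP
(see the segment header selection below). For the change buffer tree the
page is instead added to PAGE_BTR_IBUF_FREE_LIST.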
@param[in,out] index index tree @param[in,out] block block to be freed @param[in,out] mtr mini-transaction @param[in] blob whether this is freeing a BLOB page @param[in] latched whether index->table->space->x_lock() was called @return error code */ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, bool blob, bool space_latched) { ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); #if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG if (btr_search_check_marked_free_index(block)) { ut_ad(!blob); ut_ad(page_is_leaf(block->page.frame)); } #endif const uint32_t page{block->page.id().page_no()}; ut_ad(index->table->space_id == block->page.id().space()); /* The root page is freed by btr_free_root(). */ ut_ad(page != index->page); ut_ad(mtr->is_named_space(index->table->space)); /* The page gets invalid for optimistic searches: increment the frame modify clock */ buf_block_modify_clock_inc(block); /* TODO: Discard any operations for block from mtr->m_log. The page will be freed, so previous changes to it by this mini-transaction should not matter. */ if (index->is_ibuf()) return btr_page_free_for_ibuf(index, block, mtr); fil_space_t *space= index->table->space; dberr_t err; const auto savepoint= mtr->get_savepoint(); if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err)) { const bool have_latch= mtr->have_u_or_x_latch(*root); #ifdef BTR_CUR_HASH_ADAPT ut_ad(!have_latch || !root->index || !root->index->freed()); #endif mtr->rollback_to_savepoint(savepoint); if (have_latch || (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err))) err= fseg_free_page(&root->page.frame[blob || page_is_leaf(block->page.frame) ? PAGE_HEADER + PAGE_BTR_SEG_LEAF : PAGE_HEADER + PAGE_BTR_SEG_TOP], space, page, mtr, space_latched); } if (err == DB_SUCCESS) buf_page_free(space, page, mtr); /* The page was marked free in the allocation bitmap, but it should remain exclusively latched until mtr_t::commit() or until it is explicitly freed from the mini-transaction. */ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); return err; } /** Set the child page number in a node pointer record. @param[in,out] block non-leaf index page @param[in,out] rec node pointer record in the page @param[in] offsets rec_get_offsets(rec) @param[in] page_no child page number @param[in,out] mtr mini-transaction Sets the child node file address in a node pointer. */ inline void btr_node_ptr_set_child_page_no(buf_block_t *block, rec_t *rec, const rec_offs *offsets, ulint page_no, mtr_t *mtr) { ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(!page_rec_is_leaf(rec)); ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); const ulint offs= rec_offs_data_size(offsets); ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == REC_NODE_PTR_SIZE); if (UNIV_LIKELY_NULL(block->page.zip.data)) page_zip_write_node_ptr(block, rec, offs, page_no, mtr); else mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); } MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result)) /************************************************************//** Returns the child page of a node pointer and sx-latches it. 
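The child page number is stored as the last field of the node pointer
record; as an illustrative sketch (mirroring btr_node_ptr_set_child_page_no()
above), btr_node_ptr_get_child_page_no() roughly amounts to

  mach_read_from_4(node_ptr + rec_offs_data_size(offsets)
                   - REC_NODE_PTR_SIZE)

and the page with that number is what gets latched here.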
@return child page, sx-latched */ static buf_block_t* btr_node_ptr_get_child( /*===================*/ const rec_t* node_ptr,/*!< in: node pointer */ dict_index_t* index, /*!< in: index */ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err = nullptr) /*!< out: error code */ { ut_ad(rec_offs_validate(node_ptr, index, offsets)); ut_ad(index->table->space_id == page_get_space_id(page_align(node_ptr))); return btr_block_get( *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, mtr, err); } MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) /************************************************************//** Returns the upper level node pointer to a page. It is assumed that mtr holds an sx-latch on the tree. @return rec_get_offsets() of the node pointer record */ static rec_offs* btr_page_get_father_node_ptr_for_validate( rec_offs* offsets,/*!< in: work area for the return value */ mem_heap_t* heap, /*!< in: memory heap to use */ btr_cur_t* cursor, /*!< in: cursor pointing to user record, out: cursor on node pointer record, its page x-latched */ mtr_t* mtr) /*!< in: mtr */ { const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); dict_index_t* index = btr_cur_get_index(cursor); ut_ad(!dict_index_is_spatial(index)); ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); ut_ad(dict_index_get_page(index) != page_no); const auto level = btr_page_get_level(btr_cur_get_page(cursor)); const rec_t* user_rec = btr_cur_get_rec(cursor); ut_a(page_rec_is_user_rec(user_rec)); if (btr_cur_search_to_nth_level(level + 1, dict_index_build_node_ptr(index, user_rec, 0, heap, level), RW_S_LATCH, cursor, mtr) != DB_SUCCESS) { return nullptr; } const rec_t* node_ptr = btr_cur_get_rec(cursor); offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { offsets = nullptr; } return(offsets); } MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) /** Return the node pointer to a page. 
@param offsets work area for the return value @param heap memory heap @param cursor in: child page; out: node pointer to it @param mtr mini-transaction @return rec_get_offsets() of the node pointer record @retval nullptr if the parent page had not been latched in mtr */ static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap, btr_cur_t *cursor, mtr_t *mtr) { const uint32_t page_no= cursor->block()->page.id().page_no(); const dict_index_t *index= cursor->index(); ut_ad(!index->is_spatial()); ut_ad(index->page != page_no); uint32_t p= index->page; auto level= btr_page_get_level(cursor->block()->page.frame); const dtuple_t *tuple= dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level); level++; ulint i; for (i= 0; i < mtr->get_savepoint(); i++) if (buf_block_t *block= mtr->block_at_savepoint(i)) if (block->page.id().page_no() == p) { ut_ad(block->page.lock.have_u_or_x() || (!block->page.lock.have_s() && index->lock.have_x())); ulint up_match= 0, low_match= 0; cursor->page_cur.block= block; if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, &low_match, &cursor->page_cur, nullptr)) return nullptr; offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, &heap); p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); if (p != page_no) { if (btr_page_get_level(block->page.frame) == level) return nullptr; i= 0; // MDEV-29835 FIXME: require all pages to be latched in order! continue; } ut_ad(block->page.lock.have_u_or_x()); if (block->page.lock.have_u_not_x()) { /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the root page initially. */ ut_ad(block->page.id().page_no() == index->page); block->page.lock.u_x_upgrade(); mtr->page_lock_upgrade(*block); } return offsets; } return nullptr; } /************************************************************//** Returns the upper level node pointer to a page. It is assumed that mtr holds an x-latch on the tree. @return rec_get_offsets() of the node pointer record */ static rec_offs* btr_page_get_father_block( /*======================*/ rec_offs* offsets,/*!< in: work area for the return value */ mem_heap_t* heap, /*!< in: memory heap to use */ mtr_t* mtr, /*!< in: mtr */ btr_cur_t* cursor) /*!< out: cursor on node pointer record, its page x-latched */ { rec_t *rec= page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); if (UNIV_UNLIKELY(!rec)) return nullptr; cursor->page_cur.rec= rec; return btr_page_get_parent(offsets, heap, cursor, mtr); } /** Seek to the parent page of a B-tree page. @param[in,out] mtr mini-transaction @param[in,out] cursor cursor pointing to the x-latched parent page @return whether the cursor was successfully positioned */ bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor) { rec_t *rec= page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); if (UNIV_UNLIKELY(!rec)) return false; cursor->page_cur.rec= rec; mem_heap_t *heap= mem_heap_create(100); const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr); mem_heap_free(heap); return got; } #ifdef UNIV_DEBUG /** PAGE_INDEX_ID value for freed index B-trees */ constexpr index_id_t BTR_FREED_INDEX_ID = 0; #endif /** Free a B-tree root page. btr_free_but_not_root() must already have been called. 
@param block index root page @param space tablespace @param mtr mini-transaction */ static void btr_free_root(buf_block_t *block, const fil_space_t &space, mtr_t *mtr) { ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); ut_ad(mtr->is_named_space(&space)); btr_search_drop_page_hash_index(block, false); if (btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space)) { /* Free the entire segment in small steps. */ ut_d(mtr->freeing_tree()); while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->page.frame, mtr)); } } MY_ATTRIBUTE((warn_unused_result)) /** Prepare to free a B-tree. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction @return root block, to invoke btr_free_but_not_root() and btr_free_root() @retval NULL if the page is no longer a matching B-tree page */ static buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size, index_id_t index_id, mtr_t *mtr) { ut_ad(page_id.space() != SRV_TMP_SPACE_ID); ut_ad(index_id != BTR_FREED_INDEX_ID); buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_X_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, mtr); if (!block); else if (fil_page_index_page_check(block->page.frame) && index_id == btr_page_get_index_id(block->page.frame)) /* This should be a root page. It should not be possible to reassign the same index_id for some other index in the tablespace. */ ut_ad(!page_has_siblings(block->page.frame)); else block= nullptr; return block; } /** Initialize the root page of the b-tree @param[in,out] block root block @param[in] index_id index id @param[in] index index of root page @param[in,out] mtr mini-transaction */ static void btr_root_page_init(buf_block_t *block, index_id_t index_id, dict_index_t *index, mtr_t *mtr) { constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID; byte *page_index_id= my_assume_aligned<2>(field + block->page.frame); /* Create a new index page on the allocated segment page */ if (UNIV_LIKELY_NULL(block->page.zip.data)) { mach_write_to_8(page_index_id, index_id); ut_ad(!page_has_siblings(block->page.zip.data)); page_create_zip(block, index, 0, 0, mtr); } else { page_create(block, mtr, index && index->table->not_redundant()); if (index && index->is_spatial()) { static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == FIL_PAGE_RTREE, "compatibility"); mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, byte(FIL_PAGE_RTREE)); if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); } /* Set the level of the new index page */ mtr->write<2,mtr_t::MAYBE_NOP>( *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U); mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, index_id); } } /** Create the root node for a new index tree. 
@param[in] type type of the index @param[in] index_id index id @param[in,out] space tablespace where created @param[in] index index, or NULL to create a system table @param[in,out] mtr mini-transaction @param[out] err error code @return page number of the created root @retval FIL_NULL if did not succeed */ uint32_t btr_create( ulint type, fil_space_t* space, index_id_t index_id, dict_index_t* index, mtr_t* mtr, dberr_t* err) { buf_block_t* block; ut_ad(mtr->is_named_space(space)); ut_ad(index_id != BTR_FREED_INDEX_ID); ut_ad(index || space == fil_system.sys_space); /* Create the two new segments (one, in the case of an ibuf tree) for the index tree; the segment headers are put on the allocated root page (for an ibuf tree, not in the root, but on a separate ibuf header page) */ if (UNIV_UNLIKELY(type & DICT_IBUF)) { /* Allocate first the ibuf header page */ buf_block_t* ibuf_hdr_block = fseg_create( space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err); if (ibuf_hdr_block == NULL) { return(FIL_NULL); } ut_ad(ibuf_hdr_block->page.id().page_no() == IBUF_HEADER_PAGE_NO); /* Allocate then the next page to the segment: it will be the tree root page */ block = fseg_alloc_free_page_general( buf_block_get_frame(ibuf_hdr_block) + IBUF_HEADER + IBUF_TREE_SEG_HEADER, IBUF_TREE_ROOT_PAGE_NO, FSP_UP, false, mtr, mtr, err); if (block == NULL) { return(FIL_NULL); } ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); } else { block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr, err); if (block == NULL) { return(FIL_NULL); } if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, err, false, block)) { /* Not enough space for new segment, free root segment before return. */ btr_free_root(block, *space, mtr); return(FIL_NULL); } } ut_ad(!page_has_siblings(block->page.frame)); btr_root_page_init(block, index_id, index, mtr); /* We reset the free bits for the page in a separate mini-transaction to allow creation of several trees in the same mtr, otherwise the latch on a bitmap page would prevent it because of the latching order. Note: Insert Buffering is disabled for temporary tables given that most temporary tables are smaller in size and short-lived. */ if (!(type & DICT_CLUSTERED) && (!index || !index->table->is_temporary())) { ibuf_reset_free_bits(block); } /* In the following assertion we test that two records of maximum allowed size fit on the root page: this fact is needed to ensure correctness of split algorithms */ ut_ad(page_get_max_insert_size(block->page.frame, 2) > 2 * BTR_PAGE_MAX_REC_SIZE); return(block->page.id().page_no()); } /** Free a B-tree except the root page. The root page MUST be freed after this by calling btr_free_root. @param[in,out] block root page @param[in] log_mode mtr logging mode */ static void btr_free_but_not_root( buf_block_t* block, mtr_log_t log_mode #ifdef BTR_CUR_HASH_ADAPT ,bool ahi=false #endif ) { mtr_t mtr; ut_ad(fil_page_index_page_check(block->page.frame)); ut_ad(!page_has_siblings(block->page.frame)); leaf_loop: mtr_start(&mtr); ut_d(mtr.freeing_tree()); mtr_set_log_mode(&mtr, log_mode); fil_space_t *space = mtr.set_named_space_id(block->page.id().space()); if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, *block, *space) || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, *block, *space)) { mtr_commit(&mtr); return; } /* NOTE: page hash indexes are dropped when a page is freed inside fsp0fsp. 
*/ bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF + block->page.frame, &mtr #ifdef BTR_CUR_HASH_ADAPT , ahi #endif /* BTR_CUR_HASH_ADAPT */ ); mtr_commit(&mtr); if (!finished) { goto leaf_loop; } top_loop: mtr_start(&mtr); mtr_set_log_mode(&mtr, log_mode); space = mtr.set_named_space_id(block->page.id().space()); finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, *block, *space) || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->page.frame, &mtr #ifdef BTR_CUR_HASH_ADAPT ,ahi #endif /* BTR_CUR_HASH_ADAPT */ ); mtr_commit(&mtr); if (!finished) { goto top_loop; } } /** Clear the index tree and reinitialize the root page, in the rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized. @param thr query thread @return error code */ TRANSACTIONAL_TARGET dberr_t dict_index_t::clear(que_thr_t *thr) { mtr_t mtr; mtr.start(); if (table->is_temporary()) mtr.set_log_mode(MTR_LOG_NO_REDO); else set_modified(mtr); mtr_sx_lock_index(this, &mtr); dberr_t err; if (buf_block_t *root_block= buf_page_get_gen(page_id_t(table->space->id, page), table->space->zip_size(), RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) { btr_free_but_not_root(root_block, mtr.get_log_mode() #ifdef BTR_CUR_HASH_ADAPT ,n_ahi_pages() != 0 #endif ); #ifdef BTR_CUR_HASH_ADAPT if (root_block->index) btr_search_drop_page_hash_index(root_block, false); ut_ad(n_ahi_pages() == 0); #endif mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF, FSEG_HEADER_SIZE, 0); if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr, &err, false, root_block)) btr_root_page_init(root_block, id, this, &mtr); } mtr.commit(); return err; } /** Free a persistent index tree if it exists. @param[in,out] space tablespce @param[in] page root page number @param[in] index_id PAGE_INDEX_ID contents @param[in,out] mtr mini-transaction */ void btr_free_if_exists(fil_space_t *space, uint32_t page, index_id_t index_id, mtr_t *mtr) { if (buf_block_t *root= btr_free_root_check(page_id_t(space->id, page), space->zip_size(), index_id, mtr)) { btr_free_but_not_root(root, mtr->get_log_mode()); mtr->set_named_space(space); btr_free_root(root, *space, mtr); } } /** Drop a temporary table @param table temporary table */ void btr_drop_temporary_table(const dict_table_t &table) { ut_ad(table.is_temporary()); ut_ad(table.space == fil_system.temp_space); mtr_t mtr; mtr.start(); for (const dict_index_t *index= table.indexes.start; index; index= dict_table_get_next_index(index)) { if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, nullptr, false)) { btr_free_but_not_root(block, MTR_LOG_NO_REDO); mtr.set_log_mode(MTR_LOG_NO_REDO); btr_free_root(block, *fil_system.temp_space, &mtr); mtr.commit(); mtr.start(); } } mtr.commit(); } /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. 
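PAGE_ROOT_AUTO_INC is persisted in the 8-byte PAGE_MAX_TRX_ID field of the
clustered index root page; that field is unused elsewhere on clustered index
pages, so only the root repurposes it for the counter. Reading it here only
requires an S-latch on the root page.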
@param[in,out] index clustered index @return the last used AUTO_INCREMENT value @retval 0 on error or if no AUTO_INCREMENT value was used yet */ ib_uint64_t btr_read_autoinc(dict_index_t* index) { ut_ad(index->is_primary()); ut_ad(index->table->persistent_autoinc); ut_ad(!index->table->is_temporary()); mtr_t mtr; mtr.start(); ib_uint64_t autoinc; if (buf_block_t* block = buf_page_get( page_id_t(index->table->space_id, index->page), index->table->space->zip_size(), RW_S_LATCH, &mtr)) { autoinc = page_get_autoinc(block->page.frame); } else { autoinc = 0; } mtr.commit(); return autoinc; } /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, or fall back to MAX(auto_increment_column). @param[in] table table containing an AUTO_INCREMENT column @param[in] col_no index of the AUTO_INCREMENT column @return the AUTO_INCREMENT value @retval 0 on error or if no AUTO_INCREMENT value was used yet */ ib_uint64_t btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) { ut_ad(table->persistent_autoinc); ut_ad(!table->is_temporary()); dict_index_t* index = dict_table_get_first_index(table); if (index == NULL) { return 0; } mtr_t mtr; mtr.start(); buf_block_t* block = buf_page_get( page_id_t(index->table->space_id, index->page), index->table->space->zip_size(), RW_S_LATCH, &mtr); ib_uint64_t autoinc = block ? page_get_autoinc(block->page.frame) : 0; const bool retry = block && autoinc == 0 && !page_is_empty(block->page.frame); mtr.commit(); if (retry) { /* This should be an old data file where PAGE_ROOT_AUTO_INC was initialized to 0. Fall back to reading MAX(autoinc_col). There should be an index on it. */ const dict_col_t* autoinc_col = dict_table_get_nth_col(table, col_no); while (index && index->fields[0].col != autoinc_col) { index = dict_table_get_next_index(index); } if (index) { autoinc = row_search_max_autoinc(index); } } return autoinc; } /** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. @param[in,out] index clustered index @param[in] autoinc the AUTO_INCREMENT value @param[in] reset whether to reset the AUTO_INCREMENT to a possibly smaller value than currently exists in the page */ void btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) { ut_ad(index->is_primary()); ut_ad(index->table->persistent_autoinc); ut_ad(!index->table->is_temporary()); mtr_t mtr; mtr.start(); fil_space_t *space= index->table->space; if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page), space->zip_size(), RW_SX_LATCH, &mtr)) { mtr.set_named_space(space); page_set_autoinc(root, autoinc, &mtr, reset); } mtr.commit(); } /** Reorganize an index page. @param cursor index page cursor @param mtr mini-transaction */ static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr) { buf_block_t *const block= cursor->block; ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(!is_buf_block_get_page_zip(block)); ut_ad(fil_page_index_page_check(block->page.frame)); ut_ad(cursor->index->is_dummy || block->page.id().space() == cursor->index->table->space->id); ut_ad(cursor->index->is_dummy || block->page.id().page_no() != cursor->index->page || !page_has_siblings(block->page.frame)); /* Save the cursor position. 
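The position is remembered as the ordinal number of the record within the
page (pos, from page_rec_get_n_recs_before()), because the record pointer
itself becomes invalid when the page is re-created below; the cursor is
re-positioned afterwards with page_rec_get_nth().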
*/ const ulint pos= page_rec_get_n_recs_before(cursor->rec); if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) return DB_CORRUPTION; btr_search_drop_page_hash_index(block, false); buf_block_t *old= buf_block_alloc(); /* Copy the old page to temporary space */ memcpy_aligned(old->page.frame, block->page.frame, srv_page_size); const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO); page_create(block, mtr, cursor->index->table->not_redundant()); if (cursor->index->is_spatial()) block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE); static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == FIL_PAGE_RTREE, "compatibility"); /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ dberr_t err= page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->page.frame), cursor->index, mtr); mtr->set_log_mode(log_mode); if (UNIV_UNLIKELY(err != DB_SUCCESS)) return err; /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ ut_ad(!page_get_max_trx_id(block->page.frame)); memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame, PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8); #ifdef UNIV_DEBUG if (page_get_max_trx_id(block->page.frame)) /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than clustered index root pages. */ ut_ad(dict_index_is_sec_or_ibuf(cursor->index) ? page_is_leaf(block->page.frame) : block->page.id().page_no() == cursor->index->page); else /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf pages, and in temporary tables. It was always zero-initialized in page_create(). PAGE_MAX_TRX_ID must be nonzero on dict_index_is_sec_or_ibuf() leaf pages. */ ut_ad(cursor->index->table->is_temporary() || !page_is_leaf(block->page.frame) || !dict_index_is_sec_or_ibuf(cursor->index)); #endif const uint16_t data_size1= page_get_data_size(old->page.frame); const uint16_t data_size2= page_get_data_size(block->page.frame); const ulint max1= page_get_max_insert_size_after_reorganize(old->page.frame, 1); const ulint max2= page_get_max_insert_size_after_reorganize(block->page.frame, 1); if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) { sql_print_error("InnoDB: Page old data size %u new data size %u" ", page old max ins size %zu new max ins size %zu", data_size1, data_size2, max1, max2); return DB_CORRUPTION; } /* Restore the cursor position. */ if (!pos) ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame)); else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos))) return DB_CORRUPTION; if (block->page.id().page_no() != cursor->index->page || fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT) ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); else if (!cursor->index->is_instant()) { ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE)); ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2, block->page.frame + FIL_PAGE_TYPE + 2, PAGE_HEADER - FIL_PAGE_TYPE - 2)); mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame, FIL_PAGE_INDEX); } else { /* Preserve the PAGE_INSTANT information. 
*/ memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame, FIL_PAGE_TYPE + old->page.frame, 2); memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame, PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2); if (!cursor->index->table->instant); else if (page_is_comp(block->page.frame)) { memcpy(PAGE_NEW_INFIMUM + block->page.frame, PAGE_NEW_INFIMUM + old->page.frame, 8); memcpy(PAGE_NEW_SUPREMUM + block->page.frame, PAGE_NEW_SUPREMUM + old->page.frame, 8); } else { memcpy(PAGE_OLD_INFIMUM + block->page.frame, PAGE_OLD_INFIMUM + old->page.frame, 8); memcpy(PAGE_OLD_SUPREMUM + block->page.frame, PAGE_OLD_SUPREMUM + old->page.frame, 8); } ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); } ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); if (!cursor->index->has_locking()); else if (cursor->index->page == FIL_NULL) ut_ad(cursor->index->is_dummy); else lock_move_reorganize_page(block, old); /* Write log for the changes, if needed. */ if (log_mode == MTR_LOG_ALL) { /* Check and log the changes in the page header. */ ulint a, e; for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) { if (old->page.frame[a] == block->page.frame[a]) continue; while (--e, old->page.frame[e] == block->page.frame[e]); e++; ut_ad(a < e); /* Write log for the changed page header fields. */ mtr->memcpy(*block, a, e - a); break; } const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP); if (page_is_comp(block->page.frame)) { /* info_bits=0, n_owned=1, heap_no=0, status */ ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->page.frame, PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->page.frame, 3)); /* If the 'next' pointer of the infimum record has changed, log it. */ a= PAGE_NEW_INFIMUM - 2; e= a + 2; if (block->page.frame[a] == old->page.frame[a]) a++; if (--e, block->page.frame[e] != old->page.frame[e]) e++; if (ulint len= e - a) mtr->memcpy(*block, a, len); /* The infimum record itself must not change. */ ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame, PAGE_NEW_INFIMUM + old->page.frame, 8)); /* Log any change of the n_owned of the supremum record. */ a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; if (block->page.frame[a] != old->page.frame[a]) mtr->memcpy(*block, a, 1); /* The rest of the supremum record must not change. */ ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + REC_N_NEW_EXTRA_BYTES - 1)); /* Log the differences in the payload. */ for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) { if (old->page.frame[a] == block->page.frame[a]) continue; while (--e, old->page.frame[e] == block->page.frame[e]); e++; ut_ad(a < e); /* TODO: write MEMMOVE records to minimize this further! */ mtr->memcpy(*block, a, e - a); break; } } else { /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->page.frame, PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->page.frame, 4)); /* If the 'next' pointer of the infimum record has changed, log it. */ a= PAGE_OLD_INFIMUM - 2; e= a + 2; if (block->page.frame[a] == old->page.frame[a]) a++; if (--e, block->page.frame[e] != old->page.frame[e]) e++; if (ulint len= e - a) mtr->memcpy(*block, a, len); /* The infimum record itself must not change. 
*/ ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame, PAGE_OLD_INFIMUM + old->page.frame, 8)); /* Log any change of the n_owned of the supremum record. */ a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; if (block->page.frame[a] != old->page.frame[a]) mtr->memcpy(*block, a, 1); ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + REC_N_OLD_EXTRA_BYTES - 1)); /* Log the differences in the payload. */ for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) { if (old->page.frame[a] == block->page.frame[a]) continue; while (--e, old->page.frame[e] == block->page.frame[e]); e++; ut_ad(a < e); /* TODO: write MEMMOVE records to minimize this further! */ mtr->memcpy(*block, a, e - a); break; } } e= srv_page_size - PAGE_DIR; a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame); /* Zero out the payload area. */ mtr->memset(*block, top, a - top, 0); /* Log changes to the page directory. */ for (; a < e; a++) { if (old->page.frame[a] == block->page.frame[a]) continue; while (--e, old->page.frame[e] == block->page.frame[e]); e++; ut_ad(a < e); /* Write log for the changed page directory slots. */ mtr->memcpy(*block, a, e - a); break; } } buf_block_free(old); MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); return DB_SUCCESS; } /*************************************************************//** Reorganizes an index page. IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE if this is a compressed leaf page in a secondary index. This has to be done either within the same mini-transaction, or by invoking ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, IBUF_BITMAP_FREE is unaffected by reorganization. @return error code @retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ dberr_t btr_page_reorganize_block( ulint z_level,/*!< in: compression level to be used if dealing with compressed page */ buf_block_t* block, /*!< in/out: B-tree page */ dict_index_t* index, /*!< in: the index tree of the page */ mtr_t* mtr) /*!< in/out: mini-transaction */ { if (buf_block_get_page_zip(block)) return page_zip_reorganize(block, index, z_level, mtr, true); page_cur_t cur; page_cur_set_before_first(block, &cur); cur.index= index; return btr_page_reorganize_low(&cur, mtr); } /*************************************************************//** Reorganizes an index page. IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE if this is a compressed leaf page in a secondary index. This has to be done either within the same mini-transaction, or by invoking ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, IBUF_BITMAP_FREE is unaffected by reorganization. @param cursor page cursor @param mtr mini-transaction @return error code @retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr) { if (!buf_block_get_page_zip(cursor->block)) return btr_page_reorganize_low(cursor, mtr); ulint pos= page_rec_get_n_recs_before(cursor->rec); if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) return DB_CORRUPTION; dberr_t err= page_zip_reorganize(cursor->block, cursor->index, page_zip_level, mtr, true); if (err == DB_FAIL); else if (!pos) ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame)); else if (!(cursor->rec= page_rec_get_nth(cursor->block->page.frame, pos))) err= DB_CORRUPTION; return err; } /** Empty an index page (possibly the root page). @see btr_page_create(). 
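When the page is the root of a clustered index, the PAGE_ROOT_AUTO_INC value
(stored in the PAGE_MAX_TRX_ID field) is read with page_get_autoinc() before
the page is re-created and written back afterwards, so emptying the root
does not lose the persistent AUTO_INCREMENT counter.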
@param[in,out] block page to be emptied @param[in,out] page_zip compressed page frame, or NULL @param[in] index index of the page @param[in] level B-tree level of the page (0=leaf) @param[in,out] mtr mini-transaction */ void btr_page_empty( buf_block_t* block, page_zip_des_t* page_zip, dict_index_t* index, ulint level, mtr_t* mtr) { ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_zip == buf_block_get_page_zip(block)); ut_ad(!index->is_dummy); ut_ad(index->table->space->id == block->page.id().space()); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index)); #endif /* UNIV_ZIP_DEBUG */ btr_search_drop_page_hash_index(block, false); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index root page. */ const ib_uint64_t autoinc = dict_index_is_clust(index) && index->page == block->page.id().page_no() ? page_get_autoinc(block->page.frame) : 0; if (page_zip) { page_create_zip(block, index, level, autoinc, mtr); } else { page_create(block, mtr, index->table->not_redundant()); if (index->is_spatial()) { static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == FIL_PAGE_RTREE, "compatibility"); mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, byte(FIL_PAGE_RTREE)); if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) { mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); } } mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, level); if (autoinc) { mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID + block->page.frame, autoinc); } } } /** Write instant ALTER TABLE metadata to a root page. @param[in,out] root clustered index root page @param[in] index clustered index with instant ALTER TABLE @param[in,out] mtr mini-transaction */ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) { ut_ad(index.n_core_fields > 0); ut_ad(index.n_core_fields < REC_MAX_N_FIELDS); ut_ad(index.is_instant()); ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX); ut_ad(!page_has_siblings(root->page.frame)); ut_ad(root->page.id().page_no() == index.page); rec_t* infimum = page_get_infimum_rec(root->page.frame); rec_t* supremum = page_get_supremum_rec(root->page.frame); byte* page_type = root->page.frame + FIL_PAGE_TYPE; uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT); switch (mach_read_from_2(page_type)) { case FIL_PAGE_TYPE_INSTANT: ut_ad(page_get_instant(root->page.frame) == index.n_core_fields); if (memcmp(infimum, "infimum", 8) || memcmp(supremum, "supremum", 8)) { ut_ad(index.table->instant); ut_ad(!memcmp(infimum, field_ref_zero, 8)); ut_ad(!memcmp(supremum, field_ref_zero, 7)); /* The n_core_null_bytes only matters for ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. 
*/ ut_ad(supremum[7] == index.n_core_null_bytes || !index.table->not_redundant()); return; } break; default: ut_ad("wrong page type" == 0); /* fall through */ case FIL_PAGE_INDEX: ut_ad(!page_is_comp(root->page.frame) || !page_get_instant(root->page.frame)); ut_ad(!memcmp(infimum, "infimum", 8)); ut_ad(!memcmp(supremum, "supremum", 8)); mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT); ut_ad(i <= PAGE_NO_DIRECTION); i |= static_cast(index.n_core_fields << 3); mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->page.frame, i); break; } if (index.table->instant) { mtr->memset(root, infimum - root->page.frame, 8, 0); mtr->memset(root, supremum - root->page.frame, 7, 0); mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7], index.n_core_null_bytes); } } /** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. @param[in] index clustered index with instant ALTER TABLE @param[in] all whether to reset FIL_PAGE_TYPE as well @param[in,out] mtr mini-transaction */ ATTRIBUTE_COLD void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) { ut_ad(!index.table->is_temporary()); ut_ad(index.is_primary()); buf_block_t *root= btr_get_latched_root(index, mtr); byte *page_type= root->page.frame + FIL_PAGE_TYPE; if (all) { ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || mach_read_from_2(page_type) == FIL_PAGE_INDEX); mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, page_ptr_get_direction(instant + 1)); } else ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); static const byte supremuminfimum[8 + 8] = "supremuminfimum"; uint16_t infimum, supremum; if (page_is_comp(root->page.frame)) { infimum= PAGE_NEW_INFIMUM; supremum= PAGE_NEW_SUPREMUM; } else { infimum= PAGE_OLD_INFIMUM; supremum= PAGE_OLD_SUPREMUM; } ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); mtr->memcpy(*root, &root->page.frame[infimum], supremuminfimum + 8, 8); mtr->memcpy(*root, &root->page.frame[supremum], supremuminfimum, 8); } /*************************************************************//** Makes tree one level higher by splitting the root, and inserts the tuple. It is assumed that mtr contains an x-latch on the tree. NOTE that the operation of this function must always succeed, we cannot reverse it: therefore enough free disk space must be guaranteed to be available before this function is called. 
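A rough sketch of the transformation (the root page number never changes, so
index->page stays valid):

  before:  root, level L:     [r1 r2 ... rn]       full; tuple does not fit

  after:   root, level L+1:   [node pointer -> new_page]
           new_page, level L: [r1 r2 ... rn]       split next, then insert

All records are moved to a newly allocated page, the root is emptied and
raised one level, a node pointer to the new page (carrying the minimum
record flag) is inserted into the root, and the insert then proceeds by
splitting new_page.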
@return inserted record */ rec_t* btr_root_raise_and_insert( /*======================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be on the root page; when the function returns, the cursor is positioned on the predecessor of the inserted record */ rec_offs** offsets,/*!< out: offsets on inserted record */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { dict_index_t* index; rec_t* rec; dtuple_t* node_ptr; ulint level; rec_t* node_ptr_rec; page_cur_t* page_cursor; page_zip_des_t* root_page_zip; page_zip_des_t* new_page_zip; buf_block_t* root; buf_block_t* new_block; root = btr_cur_get_block(cursor); root_page_zip = buf_block_get_page_zip(root); ut_ad(!page_is_empty(root->page.frame)); index = btr_cur_get_index(cursor); ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); #ifdef UNIV_ZIP_DEBUG ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->page.frame, index)); #endif /* UNIV_ZIP_DEBUG */ const page_id_t root_id{root->page.id()}; ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); if (index->page != root_id.page_no()) { ut_ad("corrupted root page number" == 0); return nullptr; } if (index->is_ibuf()) { } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, *root, *index->table->space) || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, *root, *index->table->space)) { return nullptr; } /* Allocate a new page to the tree. Root splitting is done by first moving the root records to the new page, emptying the root, putting a node pointer to the new page, and then splitting the new page. */ level = btr_page_get_level(root->page.frame); new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err); if (!new_block) { return nullptr; } new_page_zip = buf_block_get_page_zip(new_block); ut_a(!new_page_zip == !root_page_zip); ut_a(!new_page_zip || page_zip_get_size(new_page_zip) == page_zip_get_size(root_page_zip)); btr_page_create(new_block, new_page_zip, index, level, mtr); if (page_has_siblings(new_block->page.frame)) { compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); compile_time_assert(FIL_NULL == 0xffffffff); static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV, 0xff, 8); mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); if (UNIV_LIKELY_NULL(new_page_zip)) { memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, 0xff, 8); } } /* Copy the records from root to the new page one by one. */ if (0 #ifdef UNIV_ZIP_COPY || new_page_zip #endif /* UNIV_ZIP_COPY */ || !page_copy_rec_list_end(new_block, root, page_get_infimum_rec(root->page.frame), index, mtr, err)) { switch (*err) { case DB_SUCCESS: break; case DB_FAIL: *err = DB_SUCCESS; break; default: return nullptr; } ut_a(new_page_zip); /* Copy the page byte for byte. */ page_zip_copy_recs(new_block, root_page_zip, root->page.frame, index, mtr); /* Update the lock table and possible hash index. 
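Record locks must follow the records that were moved to new_block. For spatial indexes, any predicate locks are moved as well; otherwise, the adaptive hash index entries are updated.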
*/ if (index->has_locking()) { lock_move_rec_list_end( new_block, root, page_get_infimum_rec(root->page.frame)); } /* Move any existing predicate locks */ if (dict_index_is_spatial(index)) { lock_prdt_rec_move(new_block, root_id); } else { btr_search_move_or_delete_hash_entries( new_block, root); } } constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; if (dict_index_is_sec_or_ibuf(index)) { /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID can be reset on the root page, because the field only matters on leaf pages, and the root no longer is a leaf page. (Older versions of InnoDB did set PAGE_MAX_TRX_ID on all secondary index pages.) */ byte* p = my_assume_aligned<8>( PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame); if (mach_read_from_8(p)) { mtr->memset(root, max_trx_id, 8, 0); if (UNIV_LIKELY_NULL(root->page.zip.data)) { memset_aligned<8>(max_trx_id + root->page.zip.data, 0, 8); } } } else { /* PAGE_ROOT_AUTO_INC is only present in the clustered index root page; on other clustered index pages, we want to reserve the field PAGE_MAX_TRX_ID for future use. */ byte* p = my_assume_aligned<8>( PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame); if (mach_read_from_8(p)) { mtr->memset(new_block, max_trx_id, 8, 0); if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { memset_aligned<8>(max_trx_id + new_block->page.zip.data, 0, 8); } } } /* If this is a pessimistic insert which is actually done to perform a pessimistic update then we have stored the lock information of the record to be inserted on the infimum of the root page: we cannot discard the lock structs on the root page */ if (index->has_locking()) { lock_update_root_raise(*new_block, root_id); } /* Create a memory heap where the node pointer is stored */ if (!*heap) { *heap = mem_heap_create(1000); } const uint32_t new_page_no = new_block->page.id().page_no(); rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame)); ut_ad(rec); /* We just created the page. */ /* Build the node pointer (= node key and page address) for the child */ if (dict_index_is_spatial(index)) { rtr_mbr_t new_mbr; rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); node_ptr = rtr_index_build_node_ptr( index, &new_mbr, rec, new_page_no, *heap); } else { node_ptr = dict_index_build_node_ptr( index, rec, new_page_no, *heap, level); } /* The node pointer must be marked as the predefined minimum record, as there is no lower alphabetical limit to records in the leftmost node of a level: */ dtuple_set_info_bits(node_ptr, dtuple_get_info_bits(node_ptr) | REC_INFO_MIN_REC_FLAG); /* Rebuild the root page to get free space */ btr_page_empty(root, root_page_zip, index, level + 1, mtr); /* btr_page_empty() is supposed to zero-initialize the field. */ ut_ad(!page_get_instant(root->page.frame)); if (index->is_instant()) { ut_ad(!root_page_zip); btr_set_instant(root, *index, mtr); } ut_ad(!page_has_siblings(root->page.frame)); page_cursor = btr_cur_get_page_cur(cursor); /* Insert node pointer to the root */ page_cur_set_before_first(root, page_cursor); node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, offsets, heap, 0, mtr); /* The root page should only contain the node pointer to new_block at this point. Thus, the data should fit. 
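(The node pointer record contains only the key fields of the first record on new_block and the child page number, so it is guaranteed to fit on the just-emptied root.)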
*/ ut_a(node_ptr_rec); /* We play it safe and reset the free bits for the new page */ if (!dict_index_is_clust(index) && !index->table->is_temporary()) { ibuf_reset_free_bits(new_block); } page_cursor->block = new_block; page_cursor->index = index; ut_ad(dtuple_check_typed(tuple)); /* Reposition the cursor to the child node */ ulint low_match = 0, up_match = 0; if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, &low_match, page_cursor, nullptr)) { *err = DB_CORRUPTION; return nullptr; } /* Split the child and insert tuple */ return btr_page_split_and_insert(flags, cursor, offsets, heap, tuple, n_ext, mtr, err); } /** Decide if the page should be split at the convergence point of inserts converging to the left. @param[in] cursor insert position @return the first record to be moved to the right half page @retval NULL if no split is recommended */ rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) { rec_t* split_rec = btr_cur_get_rec(cursor); const page_t* page = page_align(split_rec); if (page_header_get_ptr(page, PAGE_LAST_INSERT) != page_rec_get_next(split_rec)) { return NULL; } /* The metadata record must be present in the leftmost leaf page of the clustered index, if and only if index->is_instant(). However, during innobase_instant_try(), index->is_instant() would already hold when row_ins_clust_index_entry_low() is being invoked to insert the metadata record. So, we can only assert that when the metadata record exists, index->is_instant() must hold. */ ut_ad(!page_is_leaf(page) || page_has_prev(page) || cursor->index()->is_instant() || !(rec_get_info_bits(page_rec_get_next_const( page_get_infimum_rec(page)), cursor->index()->table->not_redundant()) & REC_INFO_MIN_REC_FLAG)); const rec_t* infimum = page_get_infimum_rec(page); /* If the convergence is in the middle of a page, include also the record immediately before the new insert to the upper page. Otherwise, we could repeatedly move from page to page lots of records smaller than the convergence point. */ if (split_rec == infimum || split_rec == page_rec_get_next_const(infimum)) { split_rec = page_rec_get_next(split_rec); } return split_rec; } /** Decide if the page should be split at the convergence point of inserts converging to the right. @param[in] cursor insert position @param[out] split_rec if split recommended, the first record on the right half page, or NULL if the to-be-inserted record should be first @return whether split is recommended */ bool btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec) { rec_t* insert_point = btr_cur_get_rec(cursor); const page_t* page = page_align(insert_point); /* We use eager heuristics: if the new insert would be right after the previous insert on the same page, we assume that there is a pattern of sequential inserts here. */ if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) { return false; } insert_point = page_rec_get_next(insert_point); if (!insert_point || page_rec_is_supremum(insert_point)) { insert_point = NULL; } else { insert_point = page_rec_get_next(insert_point); if (page_rec_is_supremum(insert_point)) { insert_point = NULL; } /* If there are >= 2 user records up from the insert point, split all but 1 off. We want to keep one because then sequential inserts can use the adaptive hash index, as they can do the necessary checks of the right search position just by looking at the records on this page.
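For example, if the page holds r1..r10, the previous insert was r8, and the new record sorts right after r8, then r10 becomes the first record of the right half-page while r9 stays here, so the next sequential insert can still be positioned by looking at this page alone.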
*/ } *split_rec = insert_point; return true; } /*************************************************************//** Calculates a split record such that the tuple will certainly fit on its half-page when the split is performed. We assume in this function only that the cursor page has at least one user record. @return split record, or NULL if tuple will be the first record on the lower or upper half-page (determined by btr_page_tuple_smaller()) */ static rec_t* btr_page_get_split_rec( /*===================*/ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext) /*!< in: number of externally stored columns */ { page_t* page; page_zip_des_t* page_zip; ulint insert_size; ulint free_space; ulint total_data; ulint total_n_recs; ulint total_space; ulint incl_data; rec_t* ins_rec; rec_t* rec; rec_t* next_rec; ulint n; mem_heap_t* heap; rec_offs* offsets; page = btr_cur_get_page(cursor); insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); free_space = page_get_free_space_of_empty(page_is_comp(page)); page_zip = btr_cur_get_page_zip(cursor); if (page_zip) { /* Estimate the free space of an empty compressed page. */ ulint free_space_zip = page_zip_empty_size( cursor->index()->n_fields, page_zip_get_size(page_zip)); if (free_space > (ulint) free_space_zip) { free_space = (ulint) free_space_zip; } } /* free_space is now the free space of a created new page */ total_data = page_get_data_size(page) + insert_size; total_n_recs = ulint(page_get_n_recs(page)) + 1; ut_ad(total_n_recs >= 2); total_space = total_data + page_dir_calc_reserved_space(total_n_recs); n = 0; incl_data = 0; ins_rec = btr_cur_get_rec(cursor); rec = page_get_infimum_rec(page); heap = NULL; offsets = NULL; /* We start to include records to the left half, and when the space reserved by them exceeds half of total_space, then if the included records fit on the left page, they will be put there if something was left over also for the right page, otherwise the last included record will be the first on the right half page */ do { /* Decide the next record to include */ if (rec == ins_rec) { rec = NULL; /* NULL denotes that tuple is now included */ } else if (rec == NULL) { rec = page_rec_get_next(ins_rec); } else { rec = page_rec_get_next(rec); } if (rec == NULL) { /* Include tuple */ incl_data += insert_size; } else { offsets = rec_get_offsets(rec, cursor->index(), offsets, page_is_leaf(page) ? cursor->index() ->n_core_fields : 0, ULINT_UNDEFINED, &heap); incl_data += rec_offs_size(offsets); } n++; } while (incl_data + page_dir_calc_reserved_space(n) < total_space / 2); if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { /* The next record will be the first on the right half page if it is not the supremum record of page */ if (rec == ins_rec) { rec = NULL; goto func_exit; } else if (rec == NULL) { next_rec = page_rec_get_next(ins_rec); } else { next_rec = page_rec_get_next(rec); } ut_ad(next_rec); if (!page_rec_is_supremum(next_rec)) { rec = next_rec; } } func_exit: if (heap) { mem_heap_free(heap); } return(rec); } #ifdef UNIV_DEBUG /*************************************************************//** Returns TRUE if the insert fits on the appropriate half-page with the chosen split_rec. 
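The check simulates the split by subtracting the size of every record that would end up on the other half-page and verifying that the remaining data, the new record, and the page directory fit into the free space of an empty page.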
@return true if fits */ static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) bool btr_page_insert_fits( /*=================*/ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ const rec_t* split_rec,/*!< in: suggestion for first record on upper half-page, or NULL if tuple to be inserted should be first */ rec_offs** offsets,/*!< in: rec_get_offsets( split_rec, cursor->index()); out: garbage */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mem_heap_t** heap) /*!< in: temporary memory heap */ { page_t* page; ulint insert_size; ulint free_space; ulint total_data; ulint total_n_recs; const rec_t* rec; const rec_t* end_rec; page = btr_cur_get_page(cursor); ut_ad(!split_rec || !page_is_comp(page) == !rec_offs_comp(*offsets)); ut_ad(!split_rec || rec_offs_validate(split_rec, cursor->index(), *offsets)); insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); free_space = page_get_free_space_of_empty(page_is_comp(page)); /* free_space is now the free space of a created new page */ total_data = page_get_data_size(page) + insert_size; total_n_recs = ulint(page_get_n_recs(page)) + 1; /* We determine which records (from rec to end_rec, not including end_rec) will end up on the other half page from tuple when it is inserted. */ if (!(end_rec = split_rec)) { end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); } else if (cmp_dtuple_rec(tuple, split_rec, cursor->index(), *offsets) < 0) { rec = split_rec; end_rec = page_get_supremum_rec(page); goto got_rec; } if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) { return false; } got_rec: if (total_data + page_dir_calc_reserved_space(total_n_recs) <= free_space) { /* Ok, there will be enough available space on the half page where the tuple is inserted */ return(true); } while (rec != end_rec) { /* In this loop we calculate the amount of reserved space after rec is removed from page. */ *offsets = rec_get_offsets(rec, cursor->index(), *offsets, page_is_leaf(page) ? cursor->index()->n_core_fields : 0, ULINT_UNDEFINED, heap); total_data -= rec_offs_size(*offsets); total_n_recs--; if (total_data + page_dir_calc_reserved_space(total_n_recs) <= free_space) { /* Ok, there will be enough available space on the half page where the tuple is inserted */ return(true); } if (!(rec = page_rec_get_next_const(rec))) { break; } } return(false); } #endif /*******************************************************//** Inserts a data tuple to a tree on a non-leaf level. It is assumed that mtr holds an x-latch on the tree. */ dberr_t btr_insert_on_non_leaf_level( ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: index */ ulint level, /*!< in: level, must be > 0 */ dtuple_t* tuple, /*!< in: the record to be inserted */ mtr_t* mtr) /*!< in: mtr */ { big_rec_t* dummy_big_rec; btr_cur_t cursor; rec_t* rec; mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets = offsets_; rec_offs_init(offsets_); rtr_info_t rtr_info; ut_ad(level > 0); flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG | BTR_NO_UNDO_LOG_FLAG; cursor.page_cur.index = index; dberr_t err; if (index->is_spatial()) { /* For spatial index, initialize structures to track its parents etc. 
*/ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); rtr_info_update_btr(&cursor, &rtr_info); err = rtr_search_to_nth_level(level, tuple, PAGE_CUR_RTREE_INSERT, BTR_CONT_MODIFY_TREE, &cursor, mtr); } else { err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH, &cursor, mtr); } ut_ad(cursor.flag == BTR_CUR_BINARY); ut_ad(btr_cur_get_block(&cursor) != mtr->at_savepoint(mtr->get_savepoint() - 1) || index->is_spatial() || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); if (UNIV_LIKELY(err == DB_SUCCESS)) { err = btr_cur_optimistic_insert(flags, &cursor, &offsets, &heap, tuple, &rec, &dummy_big_rec, 0, NULL, mtr); } if (err == DB_FAIL) { err = btr_cur_pessimistic_insert(flags, &cursor, &offsets, &heap, tuple, &rec, &dummy_big_rec, 0, NULL, mtr); } if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } if (index->is_spatial()) { ut_ad(cursor.rtr_info); rtr_clean_rtr_info(&rtr_info, true); } return err; } static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); MY_ATTRIBUTE((nonnull,warn_unused_result)) /**************************************************************//** Attaches the halves of an index page on the appropriate level in an index tree. */ static dberr_t btr_attach_half_pages( /*==================*/ ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: the index tree */ buf_block_t* block, /*!< in/out: page to be split */ const rec_t* split_rec, /*!< in: first record on upper half page */ buf_block_t* new_block, /*!< in/out: the new half page */ ulint direction, /*!< in: FSP_UP or FSP_DOWN */ mtr_t* mtr) /*!< in: mtr */ { dtuple_t* node_ptr_upper; mem_heap_t* heap; buf_block_t* prev_block = nullptr; buf_block_t* next_block = nullptr; buf_block_t* lower_block; buf_block_t* upper_block; ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); /* Create a memory heap where the data tuple is stored */ heap = mem_heap_create(1024); /* Based on split direction, decide upper and lower pages */ if (direction == FSP_DOWN) { btr_cur_t cursor; rec_offs* offsets; lower_block = new_block; upper_block = block; cursor.page_cur.block = block; cursor.page_cur.index = index; /* Look up the index for the node pointer to page */ offsets = btr_page_get_father_block(nullptr, heap, mtr, &cursor); /* Replace the address of the old child node (= page) with the address of the new lower half */ btr_node_ptr_set_child_page_no( btr_cur_get_block(&cursor), btr_cur_get_rec(&cursor), offsets, lower_block->page.id().page_no(), mtr); mem_heap_empty(heap); } else { lower_block = block; upper_block = new_block; } /* Get the level of the split pages */ const ulint level = btr_page_get_level(block->page.frame); ut_ad(level == btr_page_get_level(new_block->page.frame)); page_id_t id{block->page.id()}; /* Get the previous and next pages of page */ const uint32_t prev_page_no = btr_page_get_prev(block->page.frame); const uint32_t next_page_no = btr_page_get_next(block->page.frame); /* for consistency, both blocks should be locked, before change */ if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { id.set_page_no(prev_page_no); prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev_block) { ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, !level, mtr); } 
#endif } if (next_page_no != FIL_NULL && direction != FSP_DOWN) { id.set_page_no(next_page_no); next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next_block) { ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, !level, mtr); } #endif } /* Build the node pointer (= node key and page address) for the upper half */ node_ptr_upper = dict_index_build_node_ptr( index, split_rec, upper_block->page.id().page_no(), heap, level); /* Insert it next to the pointer to the lower half. Note that this may generate recursion leading to a split on the higher level. */ dberr_t err = btr_insert_on_non_leaf_level( flags, index, level + 1, node_ptr_upper, mtr); /* Free the memory heap */ mem_heap_free(heap); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { return err; } /* Update page links of the level */ if (prev_block) { if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } btr_page_set_next(prev_block, lower_block->page.id().page_no(), mtr); } if (next_block) { if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } btr_page_set_prev(next_block, upper_block->page.id().page_no(), mtr); } if (direction == FSP_DOWN) { ut_ad(lower_block == new_block); ut_ad(btr_page_get_next(upper_block->page.frame) == next_page_no); btr_page_set_prev(lower_block, prev_page_no, mtr); } else { ut_ad(upper_block == new_block); ut_ad(btr_page_get_prev(lower_block->page.frame) == prev_page_no); btr_page_set_next(upper_block, next_page_no, mtr); } btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); return DB_SUCCESS; } /*************************************************************//** Determine if a tuple is smaller than any record on the page. @return TRUE if smaller */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) bool btr_page_tuple_smaller( /*===================*/ btr_cur_t* cursor, /*!< in: b-tree cursor */ const dtuple_t* tuple, /*!< in: tuple to consider */ rec_offs** offsets,/*!< in/out: temporary storage */ ulint n_uniq, /*!< in: number of unique fields in the index page records */ mem_heap_t** heap) /*!< in/out: heap for offsets */ { buf_block_t* block; const rec_t* first_rec; page_cur_t pcur; /* Read the first user record in the page. */ block = btr_cur_get_block(cursor); page_cur_set_before_first(block, &pcur); if (UNIV_UNLIKELY(!(first_rec = page_cur_move_to_next(&pcur)))) { ut_ad("corrupted page" == 0); return false; } *offsets = rec_get_offsets(first_rec, cursor->index(), *offsets, page_is_leaf(block->page.frame) ? cursor->index()->n_core_fields : 0, n_uniq, heap); return cmp_dtuple_rec(tuple, first_rec, cursor->index(), *offsets) < 0; } /** Insert the tuple into the right sibling page, if the cursor is at the end of a page. @param[in] flags undo logging and locking flags @param[in,out] cursor cursor at which to insert; when the function succeeds, the cursor is positioned before the insert point. 
@param[out] offsets offsets on inserted record @param[in,out] heap memory heap for allocating offsets @param[in] tuple tuple to insert @param[in] n_ext number of externally stored columns @param[in,out] mtr mini-transaction @return inserted record (first record on the right sibling page); the cursor will be positioned on the page infimum @retval NULL if the operation was not performed */ static rec_t* btr_insert_into_right_sibling( ulint flags, btr_cur_t* cursor, rec_offs** offsets, mem_heap_t* heap, const dtuple_t* tuple, ulint n_ext, mtr_t* mtr) { buf_block_t* block = btr_cur_get_block(cursor); page_t* page = buf_block_get_frame(block); const uint32_t next_page_no = btr_page_get_next(page); ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(heap); ut_ad(dtuple_check_typed(tuple)); if (next_page_no == FIL_NULL || !page_rec_is_supremum( page_rec_get_next(btr_cur_get_rec(cursor)))) { return nullptr; } page_cur_t next_page_cursor; buf_block_t* next_block; page_t* next_page; btr_cur_t next_father_cursor; rec_t* rec = nullptr; ulint max_size; next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH, page_is_leaf(page), mtr); if (UNIV_UNLIKELY(!next_block)) { return nullptr; } next_page = buf_block_get_frame(next_block); const bool is_leaf = page_is_leaf(next_page); next_page_cursor.index = cursor->index(); next_page_cursor.block = next_block; next_father_cursor.page_cur = next_page_cursor; if (!btr_page_get_father(mtr, &next_father_cursor)) { return nullptr; } ulint up_match = 0, low_match = 0; if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, &low_match, &next_page_cursor, nullptr)) { return nullptr; } max_size = page_get_max_insert_size_after_reorganize(next_page, 1); /* Extends gap lock for the next page */ if (is_leaf && cursor->index()->has_locking()) { lock_update_node_pointer(block, next_block); } rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap, n_ext, mtr); if (!rec) { if (is_leaf && next_block->page.zip.ssize && !dict_index_is_clust(cursor->index()) && !cursor->index()->table->is_temporary()) { /* Reset the IBUF_BITMAP_FREE bits, because page_cur_tuple_insert() will have attempted page reorganize before failing. */ ibuf_reset_free_bits(next_block); } return nullptr; } ibool compressed; dberr_t err; ulint level = btr_page_get_level(next_page); /* adjust cursor position */ *btr_cur_get_page_cur(cursor) = next_page_cursor; ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); /* We have to change the parent node pointer */ compressed = btr_cur_pessimistic_delete( &err, TRUE, &next_father_cursor, BTR_CREATE_FLAG, false, mtr); if (err != DB_SUCCESS) { return nullptr; } if (!compressed) { btr_cur_compress_if_useful(&next_father_cursor, false, mtr); } dtuple_t* node_ptr = dict_index_build_node_ptr( cursor->index(), rec, next_block->page.id().page_no(), heap, level); if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1, node_ptr, mtr) != DB_SUCCESS) { return nullptr; } ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); if (is_leaf && !dict_index_is_clust(cursor->index()) && !cursor->index()->table->is_temporary()) { /* Update the free bits of the B-tree page in the insert buffer bitmap. 
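For ROW_FORMAT=COMPRESSED pages, the bits are recomputed from the current compressed free space; for uncompressed pages, they only need to be updated if the page may have become full.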
*/ if (next_block->page.zip.ssize) { ibuf_update_free_bits_zip(next_block, mtr); } else { ibuf_update_free_bits_if_full( next_block, max_size, rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); } } return(rec); } /*************************************************************//** Moves record list end to another page. Moved records include split_rec. IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if new_block is a compressed leaf page in a secondary index. This has to be done either within the same mini-transaction, or by invoking ibuf_reset_free_bits() before mtr_commit(). @return error code */ static dberr_t page_move_rec_list_end( /*===================*/ buf_block_t* new_block, /*!< in/out: index page where to move */ buf_block_t* block, /*!< in: index page from where to move */ rec_t* split_rec, /*!< in: first record to move */ dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr) /*!< in: mtr */ { page_t* new_page = buf_block_get_frame(new_block); ulint old_data_size; ulint new_data_size; ulint old_n_recs; ulint new_n_recs; ut_ad(!dict_index_is_spatial(index)); old_data_size = page_get_data_size(new_page); old_n_recs = page_get_n_recs(new_page); #ifdef UNIV_ZIP_DEBUG { page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); page_zip_des_t* page_zip = buf_block_get_page_zip(block); ut_a(!new_page_zip == !page_zip); ut_a(!new_page_zip || page_zip_validate(new_page_zip, new_page, index)); ut_a(!page_zip || page_zip_validate(page_zip, page_align(split_rec), index)); } #endif /* UNIV_ZIP_DEBUG */ dberr_t err; if (!page_copy_rec_list_end(new_block, block, split_rec, index, mtr, &err)) { return err; } new_data_size = page_get_data_size(new_page); new_n_recs = page_get_n_recs(new_page); ut_ad(new_data_size >= old_data_size); return page_delete_rec_list_end(split_rec, block, index, new_n_recs - old_n_recs, new_data_size - old_data_size, mtr); } /*************************************************************//** Moves record list start to another page. Moved records do not include split_rec. IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if new_block is a compressed leaf page in a secondary index. This has to be done either within the same mini-transaction, or by invoking ibuf_reset_free_bits() before mtr_commit(). @return error code */ static dberr_t page_move_rec_list_start( /*=====================*/ buf_block_t* new_block, /*!< in/out: index page where to move */ buf_block_t* block, /*!< in/out: page containing split_rec */ rec_t* split_rec, /*!< in: first record not to move */ dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr) /*!< in: mtr */ { dberr_t err; if (page_copy_rec_list_start(new_block, block, split_rec, index, mtr, &err)) page_delete_rec_list_start(split_rec, block, index, mtr); return err; } /*************************************************************//** Splits an index page to halves and inserts the tuple. It is assumed that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is released within this function! NOTE that the operation of this function must always succeed, we cannot reverse it: therefore enough free disk space (2 pages) must be guaranteed to be available before this function is called. 
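The split proceeds in the numbered steps below: decide the split record, allocate the new page, attach both halves to the parent level, move the records, and insert the tuple into the proper half, looping back for a further split if the insert still does not fit.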
@return inserted record or NULL if run out of space */ rec_t* btr_page_split_and_insert( /*======================*/ ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the function returns, the cursor is positioned on the predecessor of the inserted record */ rec_offs** offsets,/*!< out: offsets on inserted record */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { buf_block_t* block; page_t* page; page_zip_des_t* page_zip; buf_block_t* new_block; page_t* new_page; page_zip_des_t* new_page_zip; rec_t* split_rec; buf_block_t* left_block; buf_block_t* right_block; page_cur_t* page_cursor; rec_t* first_rec; byte* buf = 0; /* remove warning */ rec_t* move_limit; ulint n_iterations = 0; ulint n_uniq; ut_ad(*err == DB_SUCCESS); ut_ad(dtuple_check_typed(tuple)); buf_pool.pages_split++; if (cursor->index()->is_spatial()) { /* Split rtree page and update parent */ return rtr_page_split_and_insert(flags, cursor, offsets, heap, tuple, n_ext, mtr, err); } if (!*heap) { *heap = mem_heap_create(1024); } n_uniq = dict_index_get_n_unique_in_tree(cursor->index()); func_start: mem_heap_empty(*heap); *offsets = NULL; ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(!dict_index_is_online_ddl(cursor->index()) || (flags & BTR_CREATE_FLAG) || dict_index_is_clust(cursor->index())); ut_ad(cursor->index()->lock.have_u_or_x()); block = btr_cur_get_block(cursor); page = buf_block_get_frame(block); page_zip = buf_block_get_page_zip(block); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(!page_is_empty(page)); /* try to insert to the next page if possible before split */ if (rec_t* rec = btr_insert_into_right_sibling( flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { return(rec); } /* 1. Decide the split record; split_rec == NULL means that the tuple to be inserted should be the first record on the upper half-page */ bool insert_left = false; uint32_t hint_page_no = block->page.id().page_no() + 1; byte direction = FSP_UP; if (n_iterations > 0) { split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); if (split_rec == NULL) { insert_left = btr_page_tuple_smaller( cursor, tuple, offsets, n_uniq, heap); } } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { direction = FSP_DOWN; hint_page_no -= 2; } else { /* If there is only one record in the index page, we can't split the node in the middle by default. We need to determine whether the new record will be inserted to the left or right. */ if (page_get_n_recs(page) > 1) { split_rec = page_get_middle_rec(page); } else if (btr_page_tuple_smaller(cursor, tuple, offsets, n_uniq, heap)) { split_rec = page_rec_get_next( page_get_infimum_rec(page)); } else { split_rec = NULL; goto got_split_rec; } if (UNIV_UNLIKELY(!split_rec)) { *err = DB_CORRUPTION; return nullptr; } } got_split_rec: /* 2. 
Allocate a new page to the index */ const uint16_t page_level = btr_page_get_level(page); new_block = btr_page_alloc(cursor->index(), hint_page_no, direction, page_level, mtr, mtr, err); if (!new_block) { return nullptr; } new_page = buf_block_get_frame(new_block); new_page_zip = buf_block_get_page_zip(new_block); if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); } btr_page_create(new_block, new_page_zip, cursor->index(), page_level, mtr); /* Only record the leaf level page splits. */ if (!page_level) { cursor->index()->stat_defrag_n_page_split ++; cursor->index()->stat_defrag_modified_counter ++; btr_defragment_save_defrag_stats_if_needed(cursor->index()); } /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the upper half */ if (split_rec) { first_rec = move_limit = split_rec; *offsets = rec_get_offsets(split_rec, cursor->index(), *offsets, page_is_leaf(page) ? cursor->index()->n_core_fields : 0, n_uniq, heap); insert_left = cmp_dtuple_rec(tuple, split_rec, cursor->index(), *offsets) < 0; if (!insert_left && new_page_zip && n_iterations > 0) { /* If a compressed page has already been split, avoid further splits by inserting the record to an empty page. */ split_rec = NULL; goto insert_empty; } } else if (insert_left) { if (UNIV_UNLIKELY(!n_iterations)) { corrupted: *err = DB_CORRUPTION; return nullptr; } first_rec = page_rec_get_next(page_get_infimum_rec(page)); insert_move_limit: move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); if (UNIV_UNLIKELY(!first_rec || !move_limit)) { goto corrupted; } } else { insert_empty: ut_ad(!split_rec); ut_ad(!insert_left); buf = UT_NEW_ARRAY_NOKEY( byte, rec_get_converted_size(cursor->index(), tuple, n_ext)); first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(), tuple, n_ext); goto insert_move_limit; } /* 4. Do first the modifications in the tree structure */ /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */ *err = btr_attach_half_pages(flags, cursor->index(), block, first_rec, new_block, direction, mtr); if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { return nullptr; } #ifdef UNIV_DEBUG /* If the split is made on the leaf level and the insert will fit on the appropriate half-page, we may release the tree x-latch. We can then move the records after releasing the tree latch, thus reducing the tree latch contention. */ const bool insert_will_fit = !new_page_zip && btr_page_insert_fits(cursor, split_rec, offsets, tuple, n_ext, heap); #endif if (!split_rec && !insert_left) { UT_DELETE_ARRAY(buf); buf = NULL; } #if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled if (insert_will_fit && page_is_leaf(page) && !dict_index_is_online_ddl(cursor->index())) { mtr->release(cursor->index()->lock); /* NOTE: We cannot release root block latch here, because it has segment header and already modified in most of cases.*/ } #endif /* 5. Move then the records to the new page */ if (direction == FSP_DOWN) { /* fputs("Split left\n", stderr); */ if (0 #ifdef UNIV_ZIP_COPY || page_zip #endif /* UNIV_ZIP_COPY */ || (*err = page_move_rec_list_start(new_block, block, move_limit, cursor->index(), mtr))) { if (*err != DB_FAIL) { return nullptr; } /* For some reason, compressing new_block failed, even though it should contain fewer records than the original page. 
Copy the page byte for byte and then delete the records from both pages as appropriate. Deleting will always succeed. */ ut_a(new_page_zip); page_zip_copy_recs(new_block, page_zip, page, cursor->index(), mtr); *err = page_delete_rec_list_end(move_limit - page + new_page, new_block, cursor->index(), ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); if (*err != DB_SUCCESS) { return nullptr; } /* Update the lock table and possible hash index. */ if (cursor->index()->has_locking()) { lock_move_rec_list_start( new_block, block, move_limit, new_page + PAGE_NEW_INFIMUM); } btr_search_move_or_delete_hash_entries( new_block, block); /* Delete the records from the source page. */ page_delete_rec_list_start(move_limit, block, cursor->index(), mtr); } left_block = new_block; right_block = block; if (cursor->index()->has_locking()) { lock_update_split_left(right_block, left_block); } } else { /* fputs("Split right\n", stderr); */ if (0 #ifdef UNIV_ZIP_COPY || page_zip #endif /* UNIV_ZIP_COPY */ || (*err = page_move_rec_list_end(new_block, block, move_limit, cursor->index(), mtr))) { if (*err != DB_FAIL) { return nullptr; } /* For some reason, compressing new_page failed, even though it should contain fewer records than the original page. Copy the page byte for byte and then delete the records from both pages as appropriate. Deleting will always succeed. */ ut_a(new_page_zip); page_zip_copy_recs(new_block, page_zip, page, cursor->index(), mtr); page_delete_rec_list_start(move_limit - page + new_page, new_block, cursor->index(), mtr); /* Update the lock table and possible hash index. */ if (cursor->index()->has_locking()) { lock_move_rec_list_end(new_block, block, move_limit); } btr_search_move_or_delete_hash_entries( new_block, block); /* Delete the records from the source page. */ *err = page_delete_rec_list_end(move_limit, block, cursor->index(), ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); if (*err != DB_SUCCESS) { return nullptr; } } left_block = block; right_block = new_block; if (cursor->index()->has_locking()) { lock_update_split_right(right_block, left_block); } } #ifdef UNIV_ZIP_DEBUG if (page_zip) { ut_a(page_zip_validate(page_zip, page, cursor->index())); ut_a(page_zip_validate(new_page_zip, new_page, cursor->index())); } #endif /* UNIV_ZIP_DEBUG */ /* At this point, split_rec, move_limit and first_rec may point to garbage on the old page. */ /* 6. The split and the tree modification is now completed. Decide the page where the tuple should be inserted */ rec_t* rec; buf_block_t* const insert_block = insert_left ? left_block : right_block; /* 7. Reposition the cursor for insert and try insertion */ page_cursor = btr_cur_get_page_cur(cursor); page_cursor->block = insert_block; ulint up_match = 0, low_match = 0; if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, &low_match, page_cursor, nullptr)) { *err = DB_CORRUPTION; return nullptr; } rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext, mtr); #ifdef UNIV_ZIP_DEBUG { page_t* insert_page = buf_block_get_frame(insert_block); page_zip_des_t* insert_page_zip = buf_block_get_page_zip(insert_block); ut_a(!insert_page_zip || page_zip_validate(insert_page_zip, insert_page, cursor->index())); } #endif /* UNIV_ZIP_DEBUG */ if (rec != NULL) { goto func_exit; } /* 8. If insert did not fit, try page reorganization. For compressed pages, page_cur_tuple_insert() will have attempted this already. 
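In that case, skip the reorganization and go straight to another split attempt.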
*/ if (page_cur_get_page_zip(page_cursor)) { goto insert_failed; } *err = btr_page_reorganize(page_cursor, mtr); if (*err != DB_SUCCESS) { return nullptr; } rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext, mtr); if (rec == NULL) { /* The insert did not fit on the page: loop back to the start of the function for a new split */ insert_failed: /* We play safe and reset the free bits for new_page */ if (!dict_index_is_clust(page_cursor->index) && !page_cursor->index->table->is_temporary()) { ibuf_reset_free_bits(new_block); ibuf_reset_free_bits(block); } n_iterations++; ut_ad(n_iterations < 2 || buf_block_get_page_zip(insert_block)); ut_ad(!insert_will_fit); goto func_start; } func_exit: /* Insert fit on the page: update the free bits for the left and right pages in the same mtr */ if (!dict_index_is_clust(page_cursor->index) && !page_cursor->index->table->is_temporary() && page_is_leaf(page)) { ibuf_update_free_bits_for_two_pages_low( left_block, right_block, mtr); } ut_ad(page_validate(buf_block_get_frame(left_block), page_cursor->index)); ut_ad(page_validate(buf_block_get_frame(right_block), page_cursor->index)); ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); return(rec); } /** Remove a page from the level list of pages. @param[in] block page to remove @param[in] index index tree @param[in,out] mtr mini-transaction */ dberr_t btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, mtr_t* mtr) { ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); ut_ad(block.zip_size() == index.table->space->zip_size()); ut_ad(index.table->space->id == block.page.id().space()); /* Get the previous and next page numbers of page */ const uint32_t prev_page_no= btr_page_get_prev(block.page.frame); const uint32_t next_page_no= btr_page_get_next(block.page.frame); page_id_t id{block.page.id()}; buf_block_t *prev= nullptr, *next; dberr_t err; /* Update page links of the level */ if (prev_page_no != FIL_NULL) { id.set_page_no(prev_page_no); prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev) { ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); prev= btr_block_get(index, id.page_no(), RW_X_LATCH, page_is_leaf(block.page.frame), mtr, &err); if (UNIV_UNLIKELY(!prev)) return err; } #endif } if (next_page_no != FIL_NULL) { id.set_page_no(next_page_no); next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next) { ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); next= btr_block_get(index, id.page_no(), RW_X_LATCH, page_is_leaf(block.page.frame), mtr, &err); if (UNIV_UNLIKELY(!next)) return err; } #endif btr_page_set_prev(next, prev_page_no, mtr); } if (prev) btr_page_set_next(prev, next_page_no, mtr); return DB_SUCCESS; } /*************************************************************//** If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. 
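If the page is a leaf whose father is not the root, the father page (which is then also the only page on its level) is lifted up first, so that this page is later freed from the file segment that matches its level.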
@return father block */ buf_block_t* btr_lift_page_up( dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: page which is the only on its level; must not be empty: use btr_discard_only_page_on_level if the last record from the page should be removed */ mtr_t* mtr, /*!< in/out: mini-transaction */ dberr_t* err) /*!< out: error code */ { buf_block_t* father_block; ulint page_level; page_zip_des_t* father_page_zip; page_t* page = buf_block_get_frame(block); ulint root_page_no; buf_block_t* blocks[BTR_MAX_LEVELS]; ulint n_blocks; /*!< last used index in blocks[] */ ulint i; bool lift_father_up; buf_block_t* block_orig = block; ut_ad(!page_has_siblings(page)); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); ut_ad(!page_is_empty(page)); page_level = btr_page_get_level(page); root_page_no = dict_index_get_page(index); { btr_cur_t cursor; rec_offs* offsets = NULL; mem_heap_t* heap = mem_heap_create( sizeof(*offsets) * (REC_OFFS_HEADER_SIZE + 1 + 1 + unsigned(index->n_fields))); buf_block_t* b; cursor.page_cur.index = index; cursor.page_cur.block = block; if (index->is_spatial()) { offsets = rtr_page_get_father_block( nullptr, heap, mtr, nullptr, &cursor); } else { offsets = btr_page_get_father_block(offsets, heap, mtr, &cursor); } father_block = btr_cur_get_block(&cursor); father_page_zip = buf_block_get_page_zip(father_block); n_blocks = 0; /* Store all ancestor pages so we can reset their levels later on. We have to do all the searches on the tree now because later on, after we've replaced the first level, the tree is in an inconsistent state and can not be searched. */ for (b = father_block; b->page.id().page_no() != root_page_no; ) { ut_a(n_blocks < BTR_MAX_LEVELS); if (index->is_spatial()) { offsets = rtr_page_get_father_block( nullptr, heap, mtr, nullptr, &cursor); } else { offsets = btr_page_get_father_block(offsets, heap, mtr, &cursor); } blocks[n_blocks++] = b = btr_cur_get_block(&cursor); } lift_father_up = (n_blocks && page_level == 0); if (lift_father_up) { /* The father page also should be the only on its level (not root). We should lift up the father page at first. Because the leaf page should be lifted up only for root page. The freeing page is based on page_level (==0 or !=0) to choose segment. If the page_level is changed ==0 from !=0, later freeing of the page doesn't find the page allocation to be freed.*/ block = father_block; page = buf_block_get_frame(block); page_level = btr_page_get_level(page); ut_ad(!page_has_siblings(page)); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); father_block = blocks[0]; father_page_zip = buf_block_get_page_zip(father_block); } mem_heap_free(heap); } btr_search_drop_page_hash_index(block, false); /* Make the father empty */ btr_page_empty(father_block, father_page_zip, index, page_level, mtr); /* btr_page_empty() is supposed to zero-initialize the field. */ ut_ad(!page_get_instant(father_block->page.frame)); if (index->is_instant() && father_block->page.id().page_no() == root_page_no) { ut_ad(!father_page_zip); if (page_is_leaf(page)) { const rec_t* rec = page_rec_get_next( page_get_infimum_rec(page)); ut_ad(rec_is_metadata(rec, *index)); if (rec_is_add_metadata(rec, *index) && page_get_n_recs(page) == 1) { index->clear_instant_add(); goto copied; } } btr_set_instant(father_block, *index, mtr); } /* Copy the records to the father page one by one. 
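If the father is a ROW_FORMAT=COMPRESSED page and the copied records do not compress, fall back to copying the page byte for byte.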
*/ if (0 #ifdef UNIV_ZIP_COPY || father_page_zip #endif /* UNIV_ZIP_COPY */ || !page_copy_rec_list_end(father_block, block, page_get_infimum_rec(page), index, mtr, err)) { switch (*err) { case DB_SUCCESS: break; case DB_FAIL: *err = DB_SUCCESS; break; default: return nullptr; } const page_zip_des_t* page_zip = buf_block_get_page_zip(block); ut_a(father_page_zip); ut_a(page_zip); /* Copy the page byte for byte. */ page_zip_copy_recs(father_block, page_zip, page, index, mtr); /* Update the lock table and possible hash index. */ if (index->has_locking()) { lock_move_rec_list_end(father_block, block, page_get_infimum_rec(page)); } /* Also update the predicate locks */ if (dict_index_is_spatial(index)) { lock_prdt_rec_move(father_block, block->page.id()); } else { btr_search_move_or_delete_hash_entries( father_block, block); } } copied: if (index->has_locking()) { const page_id_t id{block->page.id()}; /* Free predicate page locks on the block */ if (index->is_spatial()) { lock_sys.prdt_page_free_from_discard(id); } else { lock_update_copy_and_discard(*father_block, id); } } page_level++; /* Go upward to root page, decrementing levels by one. */ for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) { ut_ad(btr_page_get_level(blocks[i]->page.frame) == page_level + 1); btr_page_set_level(blocks[i], page_level, mtr); } if (dict_index_is_spatial(index)) { rtr_check_discard_page(index, NULL, block); } /* Free the file page */ btr_page_free(index, block, mtr); /* We play it safe and reset the free bits for the father */ if (!dict_index_is_clust(index) && !index->table->is_temporary()) { ibuf_reset_free_bits(father_block); } ut_ad(page_validate(father_block->page.frame, index)); ut_ad(btr_check_node_ptr(index, father_block, mtr)); return(lift_father_up ? block_orig : father_block); } /*************************************************************//** Tries to merge the page first to the left immediate brother if such a brother exists, and the node pointers to the current page and to the brother reside on the same page. If the left brother does not satisfy these conditions, looks at the right brother. If the page is the only one on that level lifts the records of the page to the father page, thus reducing the tree height. It is assumed that mtr holds an x-latch on the tree and on the page. If cursor is on the leaf level, mtr must also hold x-latches to the brothers, if they exist. 
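Merging into the left sibling is attempted first; if neither sibling can accommodate the records, DB_FAIL is returned and the page is left unchanged.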
@return error code */ dberr_t btr_compress( /*=========*/ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge or lift; the page must not be empty: when deleting records, use btr_discard_page() if the page would become empty */ bool adjust, /*!< in: whether the cursor position should be adjusted even when compression occurs */ mtr_t* mtr) /*!< in/out: mini-transaction */ { dict_index_t* index; buf_block_t* merge_block = nullptr; page_t* merge_page = nullptr; page_zip_des_t* merge_page_zip; ibool is_left; buf_block_t* block; page_t* page; btr_cur_t father_cursor; mem_heap_t* heap; rec_offs* offsets; ulint nth_rec = 0; /* remove bogus warning */ bool mbr_changed = false; #ifdef UNIV_DEBUG bool leftmost_child; #endif DBUG_ENTER("btr_compress"); block = btr_cur_get_block(cursor); page = btr_cur_get_page(cursor); index = btr_cur_get_index(cursor); ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); const uint32_t left_page_no = btr_page_get_prev(page); const uint32_t right_page_no = btr_page_get_next(page); dberr_t err = DB_SUCCESS; ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( page_rec_get_next(page_get_infimum_rec(page)), page_is_comp(page)))); heap = mem_heap_create(100); father_cursor.page_cur.index = index; father_cursor.page_cur.block = block; if (index->is_spatial()) { offsets = rtr_page_get_father_block( NULL, heap, mtr, cursor, &father_cursor); ut_ad(cursor->page_cur.block->page.id() == block->page.id()); rec_t* my_rec = father_cursor.page_cur.rec; ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets); if (page_no != block->page.id().page_no()) { ib::info() << "father positioned on page " << page_no << " instead of " << block->page.id().page_no(); offsets = btr_page_get_father_block( NULL, heap, mtr, &father_cursor); } } else { offsets = btr_page_get_father_block( NULL, heap, mtr, &father_cursor); } if (adjust) { nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) { corrupted: err = DB_CORRUPTION; err_exit: /* We play it safe and reset the free bits.
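Resetting the bits can never overstate the free space on the page, so it is always safe on this error path.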
*/ if (merge_block && merge_block->zip_size() && page_is_leaf(merge_block->page.frame) && !index->is_clust()) { ibuf_reset_free_bits(merge_block); } goto func_exit; } } if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { /* The page is the only one on the level, lift the records to the father */ merge_block = btr_lift_page_up(index, block, mtr, &err); success: if (adjust) { ut_ad(nth_rec > 0); if (rec_t* nth = page_rec_get_nth(merge_block->page.frame, nth_rec)) { btr_cur_position(index, nth, merge_block, cursor); } else { goto corrupted; } } MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); func_exit: mem_heap_free(heap); DBUG_RETURN(err); } ut_d(leftmost_child = left_page_no != FIL_NULL && (page_rec_get_next( page_get_infimum_rec( btr_cur_get_page(&father_cursor))) == btr_cur_get_rec(&father_cursor))); /* Decide the page to which we try to merge and which will inherit the locks */ is_left = btr_can_merge_with_page(cursor, left_page_no, &merge_block, mtr); DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); retry: if (!is_left && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, mtr)) { if (!merge_block) { merge_page = NULL; } cannot_merge: err = DB_FAIL; goto err_exit; } merge_page = buf_block_get_frame(merge_block); if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left ? FIL_PAGE_NEXT : FIL_PAGE_PREV), block->page.frame + FIL_PAGE_OFFSET, 4))) { goto corrupted; } ut_ad(page_validate(merge_page, index)); merge_page_zip = buf_block_get_page_zip(merge_block); #ifdef UNIV_ZIP_DEBUG if (merge_page_zip) { const page_zip_des_t* page_zip = buf_block_get_page_zip(block); ut_a(page_zip); ut_a(page_zip_validate(merge_page_zip, merge_page, index)); ut_a(page_zip_validate(page_zip, page, index)); } #endif /* UNIV_ZIP_DEBUG */ btr_cur_t cursor2; cursor2.page_cur.index = index; cursor2.page_cur.block = merge_block; /* Move records to the merge page */ if (is_left) { rtr_mbr_t new_mbr; rec_offs* offsets2 = NULL; /* For rtree, we need to update father's mbr. */ if (index->is_spatial()) { /* We only support merge pages with the same parent page */ if (!rtr_check_same_block( index, &cursor2, btr_cur_get_block(&father_cursor), heap)) { is_left = false; goto retry; } /* Set rtr_info for cursor2, since it is necessary in recursive page merge. */ cursor2.rtr_info = cursor->rtr_info; cursor2.tree_height = cursor->tree_height; offsets2 = rec_get_offsets( btr_cur_get_rec(&cursor2), index, NULL, page_is_leaf(btr_cur_get_page(&cursor2)) ? 
index->n_fields : 0, ULINT_UNDEFINED, &heap); /* Check if parent entry needs to be updated */ mbr_changed = rtr_merge_mbr_changed( &cursor2, &father_cursor, offsets2, offsets, &new_mbr); } rec_t* orig_pred = page_copy_rec_list_start( merge_block, block, page_get_supremum_rec(page), index, mtr, &err); if (!orig_pred) { goto err_exit; } btr_search_drop_page_hash_index(block, false); /* Remove the page from the level list */ err = btr_level_list_remove(*block, *index, mtr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { goto err_exit; } const page_id_t id{block->page.id()}; if (index->is_spatial()) { rec_t* my_rec = father_cursor.page_cur.rec; ulint page_no = btr_node_ptr_get_child_page_no( my_rec, offsets); if (page_no != block->page.id().page_no()) { ib::fatal() << "father positioned on " << page_no << " instead of " << block->page.id().page_no(); } if (mbr_changed) { rtr_update_mbr_field( &cursor2, offsets2, &father_cursor, merge_page, &new_mbr, NULL, mtr); } else { rtr_node_ptr_delete(&father_cursor, mtr); } /* No GAP lock needs to be worrying about */ lock_sys.prdt_page_free_from_discard(id); } else { err = btr_cur_node_ptr_delete(&father_cursor, mtr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { goto err_exit; } if (index->has_locking()) { lock_update_merge_left( *merge_block, orig_pred, id); } } if (adjust) { ulint n = page_rec_get_n_recs_before(orig_pred); if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) { goto corrupted; } nth_rec += n; } } else { rec_t* orig_succ; ibool compressed; dberr_t err; byte fil_page_prev[4]; if (index->is_spatial()) { /* For spatial index, we disallow merge of blocks with different parents, since the merge would need to update entry (for MBR and Primary key) in the parent of block being merged */ if (!rtr_check_same_block( index, &cursor2, btr_cur_get_block(&father_cursor), heap)) { goto cannot_merge; } /* Set rtr_info for cursor2, since it is necessary in recursive page merge. */ cursor2.rtr_info = cursor->rtr_info; cursor2.tree_height = cursor->tree_height; } else if (!btr_page_get_father(mtr, &cursor2)) { goto cannot_merge; } if (merge_page_zip && left_page_no == FIL_NULL) { /* The function page_zip_compress(), which will be invoked by page_copy_rec_list_end() below, requires that FIL_PAGE_PREV be FIL_NULL. Clear the field, but prepare to restore it. */ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); compile_time_assert(FIL_NULL == 0xffffffffU); memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4); } orig_succ = page_copy_rec_list_end(merge_block, block, page_get_infimum_rec(page), cursor->index(), mtr, &err); if (!orig_succ) { ut_a(merge_page_zip); if (left_page_no == FIL_NULL) { /* FIL_PAGE_PREV was restored from merge_page_zip. */ ut_ad(!memcmp(fil_page_prev, merge_page + FIL_PAGE_PREV, 4)); } goto err_exit; } btr_search_drop_page_hash_index(block, false); if (merge_page_zip && left_page_no == FIL_NULL) { /* Restore FIL_PAGE_PREV in order to avoid an assertion failure in btr_level_list_remove(), which will set the field again to FIL_NULL. Even though this makes merge_page and merge_page_zip inconsistent for a split second, it is harmless, because the pages are X-latched. 
*/ memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); } /* Remove the page from the level list */ err = btr_level_list_remove(*block, *index, mtr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { goto err_exit; } ut_ad(btr_node_ptr_get_child_page_no( btr_cur_get_rec(&father_cursor), offsets) == block->page.id().page_no()); /* Replace the address of the old child node (= page) with the address of the merge page to the right */ btr_node_ptr_set_child_page_no( btr_cur_get_block(&father_cursor), btr_cur_get_rec(&father_cursor), offsets, right_page_no, mtr); #ifdef UNIV_DEBUG if (!page_is_leaf(page) && left_page_no == FIL_NULL) { ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( page_rec_get_next(page_get_infimum_rec( buf_block_get_frame(merge_block))), page_is_comp(page))); } #endif /* UNIV_DEBUG */ /* For rtree, we need to update father's mbr. */ if (index->is_spatial()) { rec_offs* offsets2; ulint rec_info; offsets2 = rec_get_offsets( btr_cur_get_rec(&cursor2), index, NULL, page_is_leaf(btr_cur_get_page(&cursor2)) ? index->n_fields : 0, ULINT_UNDEFINED, &heap); ut_ad(btr_node_ptr_get_child_page_no( btr_cur_get_rec(&cursor2), offsets2) == right_page_no); rec_info = rec_get_info_bits( btr_cur_get_rec(&father_cursor), rec_offs_comp(offsets)); if (rec_info & REC_INFO_MIN_REC_FLAG) { /* When the father node ptr is minimal rec, we will keep it and delete the node ptr of merge page. */ rtr_merge_and_update_mbr(&father_cursor, &cursor2, offsets, offsets2, merge_page, mtr); } else { /* Otherwise, we will keep the node ptr of merge page and delete the father node ptr. This is for keeping the rec order in upper level. */ rtr_merge_and_update_mbr(&cursor2, &father_cursor, offsets2, offsets, merge_page, mtr); } const page_id_t id{block->page.id()}; lock_sys.prdt_page_free_from_discard(id); } else { compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2, BTR_CREATE_FLAG, false, mtr); ut_a(err == DB_SUCCESS); if (!compressed) { btr_cur_compress_if_useful(&cursor2, false, mtr); } if (index->has_locking()) { lock_update_merge_right( merge_block, orig_succ, block); } } } if (!dict_index_is_clust(index) && !index->table->is_temporary() && page_is_leaf(merge_page)) { /* Update the free bits of the B-tree page in the insert buffer bitmap. This has to be done in a separate mini-transaction that is committed before the main mini-transaction. We cannot update the insert buffer bitmap in this mini-transaction, because btr_compress() can be invoked recursively without committing the mini-transaction in between. Since insert buffer bitmap pages have a lower rank than B-tree pages, we must not access other pages in the same mini-transaction after accessing an insert buffer bitmap page. */ /* The free bits in the insert buffer bitmap must never exceed the free space on a page. It is safe to decrement or reset the bits in the bitmap in a mini-transaction that is committed before the mini-transaction that affects the free space. */ /* It is unsafe to increment the bits in a separately committed mini-transaction, because in crash recovery, the free bits could momentarily be set too high. */ if (merge_block->zip_size()) { /* Because the free bits may be incremented and we cannot update the insert buffer bitmap in the same mini-transaction, the only safe thing we can do here is the pessimistic approach: reset the free bits. */ ibuf_reset_free_bits(merge_block); } else { /* On uncompressed pages, the free bits will never increase here. Thus, it is safe to write the bits accurately in a separate mini-transaction. 
*/ ibuf_update_free_bits_if_full(merge_block, srv_page_size, ULINT_UNDEFINED); } } ut_ad(page_validate(merge_page, index)); #ifdef UNIV_ZIP_DEBUG ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, index)); #endif /* UNIV_ZIP_DEBUG */ if (dict_index_is_spatial(index)) { rtr_check_discard_page(index, NULL, block); } /* Free the file page */ err = btr_page_free(index, block, mtr); if (err == DB_SUCCESS) { ut_ad(leftmost_child || btr_check_node_ptr(index, merge_block, mtr)); goto success; } else { goto err_exit; } } /*************************************************************//** Discards a page that is the only page on its level. This will empty the whole B-tree, leaving just an empty root page. This function should almost never be reached, because btr_compress(), which is invoked in delete operations, calls btr_lift_page_up() to flatten the B-tree. */ ATTRIBUTE_COLD static void btr_discard_only_page_on_level( /*===========================*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: page which is the only on its level */ mtr_t* mtr) /*!< in: mtr */ { ulint page_level = 0; ut_ad(!index->is_dummy); /* Save the PAGE_MAX_TRX_ID from the leaf page. */ const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame); const rec_t* r = page_rec_get_next( page_get_infimum_rec(block->page.frame)); /* In the caller we checked that a valid key exists in the page, because we were able to look up a parent page. */ ut_ad(r); ut_ad(rec_is_metadata(r, *index) == index->is_instant()); while (block->page.id().page_no() != dict_index_get_page(index)) { btr_cur_t cursor; buf_block_t* father; const page_t* page = buf_block_get_frame(block); ut_a(page_get_n_recs(page) == 1); ut_a(page_level == btr_page_get_level(page)); ut_a(!page_has_siblings(page)); ut_ad(fil_page_index_page_check(page)); ut_ad(block->page.id().space() == index->table->space->id); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); btr_search_drop_page_hash_index(block, false); cursor.page_cur.index = index; cursor.page_cur.block = block; if (index->is_spatial()) { /* Check any concurrent search having this page */ rtr_check_discard_page(index, NULL, block); if (!rtr_page_get_father(mtr, nullptr, &cursor)) { return; } } else { if (!btr_page_get_father(mtr, &cursor)) { return; } } father = btr_cur_get_block(&cursor); if (index->has_locking()) { lock_update_discard( father, PAGE_HEAP_NO_SUPREMUM, block); } /* Free the file page */ if (btr_page_free(index, block, mtr) != DB_SUCCESS) { return; } block = father; page_level++; } /* block is the root page, which must be empty, except for the node pointer to the (now discarded) block(s). */ ut_ad(!page_has_siblings(block->page.frame)); mem_heap_t* heap = nullptr; const rec_t* rec = nullptr; rec_offs* offsets = nullptr; if (index->table->instant || index->must_avoid_clear_instant_add()) { if (!rec_is_metadata(r, *index)) { } else if (!index->table->instant || rec_is_alter_metadata(r, *index)) { heap = mem_heap_create(srv_page_size); offsets = rec_get_offsets(r, index, nullptr, index->n_core_fields, ULINT_UNDEFINED, &heap); rec = rec_copy(mem_heap_alloc(heap, rec_offs_size(offsets)), r, offsets); rec_offs_make_valid(rec, index, true, offsets); } } btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); ut_ad(page_is_leaf(buf_block_get_frame(block))); /* btr_page_empty() is supposed to zero-initialize the field. 
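	The field in question is PAGE_INSTANT in the page header, which the
	assertion below reads through page_get_instant().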
*/ ut_ad(!page_get_instant(block->page.frame)); if (index->is_primary()) { if (rec) { page_cur_t cur; page_cur_set_before_first(block, &cur); cur.index = index; DBUG_ASSERT(index->table->instant); DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); btr_set_instant(block, *index, mtr); rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr); ut_ad(rec); mem_heap_free(heap); } else if (index->is_instant()) { index->clear_instant_add(); } } else if (!index->table->is_temporary()) { /* We play it safe and reset the free bits for the root */ ibuf_reset_free_bits(block); ut_a(max_trx_id); page_set_max_trx_id(block, buf_block_get_page_zip(block), max_trx_id, mtr); } } /*************************************************************//** Discards a page from a B-tree. This is used to remove the last record from a B-tree page: the whole page must be removed at the same time. This cannot be used for the root page, which is allowed to be empty. */ dberr_t btr_discard_page( /*=============*/ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on the root page */ mtr_t* mtr) /*!< in: mtr */ { dict_index_t* index; buf_block_t* merge_block; buf_block_t* block; btr_cur_t parent_cursor; block = btr_cur_get_block(cursor); index = btr_cur_get_index(cursor); parent_cursor.page_cur = cursor->page_cur; ut_ad(dict_index_get_page(index) != block->page.id().page_no()); ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); MONITOR_INC(MONITOR_INDEX_DISCARD); if (index->is_spatial() ? !rtr_page_get_father(mtr, cursor, &parent_cursor) : !btr_page_get_father(mtr, &parent_cursor)) { return DB_CORRUPTION; } /* Decide the page which will inherit the locks */ const uint32_t left_page_no = btr_page_get_prev(block->page.frame); const uint32_t right_page_no = btr_page_get_next(block->page.frame); page_id_t merge_page_id{block->page.id()}; ut_d(bool parent_is_different = false); dberr_t err; if (left_page_no != FIL_NULL) { merge_page_id.set_page_no(left_page_no); merge_block = btr_block_reget(mtr, *index, merge_page_id, &err); if (UNIV_UNLIKELY(!merge_block)) { return err; } #if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, 4)); #else if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } #endif ut_d(parent_is_different = (page_rec_get_next( page_get_infimum_rec( btr_cur_get_page( &parent_cursor))) == btr_cur_get_rec(&parent_cursor))); } else if (right_page_no != FIL_NULL) { merge_page_id.set_page_no(right_page_no); merge_block = btr_block_reget(mtr, *index, merge_page_id, &err); if (UNIV_UNLIKELY(!merge_block)) { return err; } #if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, 4)); #else if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, 4))) { return DB_CORRUPTION; } #endif ut_d(parent_is_different = page_rec_is_supremum( page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); if (page_is_leaf(merge_block->page.frame)) { } else if (rec_t* node_ptr = page_rec_get_next(page_get_infimum_rec( merge_block->page.frame))) { ut_ad(page_rec_is_user_rec(node_ptr)); /* We have to mark the leftmost node pointer as the predefined minimum record. 
*/ btr_set_min_rec_mark(node_ptr, *merge_block, mtr); } else { return DB_CORRUPTION; } } else { btr_discard_only_page_on_level(index, block, mtr); return DB_SUCCESS; } if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame [PAGE_HEADER + PAGE_LEVEL], &block->page.frame [PAGE_HEADER + PAGE_LEVEL], 2))) { return DB_CORRUPTION; } btr_search_drop_page_hash_index(block, false); if (dict_index_is_spatial(index)) { rtr_node_ptr_delete(&parent_cursor, mtr); } else if (dberr_t err = btr_cur_node_ptr_delete(&parent_cursor, mtr)) { return err; } /* Remove the page from the level list */ if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) { return err; } #ifdef UNIV_ZIP_DEBUG if (page_zip_des_t* merge_page_zip = buf_block_get_page_zip(merge_block)) ut_a(page_zip_validate(merge_page_zip, merge_block->page.frame, index)); #endif /* UNIV_ZIP_DEBUG */ if (index->has_locking()) { if (left_page_no != FIL_NULL) { lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, block); } else { lock_update_discard(merge_block, lock_get_min_heap_no(merge_block), block); } if (index->is_spatial()) { rtr_check_discard_page(index, cursor, block); } } /* Free the file page */ err = btr_page_free(index, block, mtr); if (err == DB_SUCCESS) { /* btr_check_node_ptr() needs parent block latched. If the merge_block's parent block is not same, we cannot use btr_check_node_ptr() */ ut_ad(parent_is_different || btr_check_node_ptr(index, merge_block, mtr)); if (btr_cur_get_block(&parent_cursor)->page.id().page_no() == index->page && !page_has_siblings(btr_cur_get_page(&parent_cursor)) && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) { btr_lift_page_up(index, merge_block, mtr, &err); } } return err; } #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ void btr_print_size( /*===========*/ dict_index_t* index) /*!< in: index tree */ { page_t* root; fseg_header_t* seg; mtr_t mtr; if (dict_index_is_ibuf(index)) { fputs("Sorry, cannot print info of an ibuf tree:" " use ibuf functions\n", stderr); return; } mtr_start(&mtr); root = btr_root_get(index, &mtr); seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); fseg_print(seg, &mtr); if (!dict_index_is_ibuf(index)) { seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); fseg_print(seg, &mtr); } mtr_commit(&mtr); } /************************************************************//** Prints recursively index tree pages. 
*/ static void btr_print_recursive( /*================*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: index page */ ulint width, /*!< in: print this many entries from start and end */ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ mtr_t* mtr) /*!< in: mtr */ { const page_t* page = buf_block_get_frame(block); page_cur_t cursor; ulint n_recs; ulint i = 0; mtr_t mtr2; ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) << " page " << block->page.id; page_print(block, index, width, width); n_recs = page_get_n_recs(page); page_cur_set_before_first(block, &cursor); page_cur_move_to_next(&cursor); while (!page_cur_is_after_last(&cursor)) { if (page_is_leaf(page)) { /* If this is the leaf level, do nothing */ } else if ((i <= width) || (i >= n_recs - width)) { const rec_t* node_ptr; mtr_start(&mtr2); node_ptr = page_cur_get_rec(&cursor); *offsets = rec_get_offsets( node_ptr, index, *offsets, 0, ULINT_UNDEFINED, heap); if (buf_block_t *child = btr_node_ptr_get_child(node_ptr, index, *offsets, &mtr2)) { btr_print_recursive(index, child, width, heap, offsets, &mtr2); } mtr_commit(&mtr2); } page_cur_move_to_next(&cursor); i++; } } /**************************************************************//** Prints directories and other info of all nodes in the tree. */ void btr_print_index( /*============*/ dict_index_t* index, /*!< in: index */ ulint width) /*!< in: print this many entries from start and end */ { mtr_t mtr; buf_block_t* root; mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets = offsets_; rec_offs_init(offsets_); fputs("--------------------------\n" "INDEX TREE PRINT\n", stderr); mtr_start(&mtr); root = btr_root_block_get(index, RW_SX_LATCH, &mtr); btr_print_recursive(index, root, width, &heap, &offsets, &mtr); if (heap) { mem_heap_free(heap); } mtr_commit(&mtr); ut_ad(btr_validate_index(index, 0)); } #endif /* UNIV_BTR_PRINT */ #ifdef UNIV_DEBUG /************************************************************//** Checks that the node pointer to a page is appropriate. 
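The father record of the page is looked up, and for a non-leaf page a node
pointer tuple built from the page's first user record is compared with it;
for a spatial index the first record must instead lie WITHIN the MBR stored
in the father record. The root page is accepted without any check.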
@return TRUE */
ibool
btr_check_node_ptr(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	mem_heap_t*	heap;
	dtuple_t*	tuple;
	rec_offs*	offsets;
	btr_cur_t	cursor;
	page_t*		page = buf_block_get_frame(block);

	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));

	if (dict_index_get_page(index) == block->page.id().page_no()) {
		return(TRUE);
	}

	cursor.page_cur.index = index;
	cursor.page_cur.block = block;

	heap = mem_heap_create(256);

	if (dict_index_is_spatial(index)) {
		offsets = rtr_page_get_father_block(NULL, heap, mtr,
						    NULL, &cursor);
	} else {
		offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor);
	}

	ut_ad(offsets);

	if (page_is_leaf(page)) {
		goto func_exit;
	}

	tuple = dict_index_build_node_ptr(
		index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
		btr_page_get_level(page));

	/* For a spatial index, the MBR in the parent record may differ
	from that of the first record of the child page; their
	relationship must be "WITHIN" */
	if (dict_index_is_spatial(index)) {
		ut_a(!cmp_dtuple_rec_with_gis(
			tuple, btr_cur_get_rec(&cursor), PAGE_CUR_WITHIN));
	} else {
		ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), index,
				     offsets));
	}
func_exit:
	mem_heap_free(heap);

	return(TRUE);
}
#endif /* UNIV_DEBUG */

/************************************************************//**
Display identification information for a record. */
static
void
btr_index_rec_validate_report(
/*==========================*/
	const page_t*		page,	/*!< in: index page */
	const rec_t*		rec,	/*!< in: index record */
	const dict_index_t*	index)	/*!< in: index */
{
	ib::info() << "Record in index " << index->name
		<< " of table " << index->table->name
		<< ", page " << page_id_t(page_get_space_id(page),
					  page_get_page_no(page))
		<< ", at offset " << page_offset(rec);
}

/************************************************************//**
Checks the size and number of fields in a record based on the definition of
the index.
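On the leftmost leaf page of a clustered index that carries instant
ALTER TABLE metadata, the first record may be an ALTER TABLE metadata
record, which contains one field more than index->n_fields.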
@return TRUE if ok */ ibool btr_index_rec_validate( /*===================*/ const rec_t* rec, /*!< in: index record */ const dict_index_t* index, /*!< in: index */ ibool dump_on_error) /*!< in: TRUE if the function should print hex dump of record and page on error */ { ulint len; const page_t* page; mem_heap_t* heap = NULL; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs* offsets = offsets_; rec_offs_init(offsets_); page = page_align(rec); ut_ad(index->n_core_fields); if (index->is_ibuf()) { /* The insert buffer index tree can contain records from any other index: we cannot check the number of fields or their length */ return(TRUE); } #ifdef VIRTUAL_INDEX_DEBUG if (dict_index_has_virtual(index)) { fprintf(stderr, "index name is %s\n", index->name()); } #endif if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { btr_index_rec_validate_report(page, rec, index); ib::error() << "Compact flag=" << !!page_is_comp(page) << ", should be " << dict_table_is_comp(index->table); return(FALSE); } const bool is_alter_metadata = page_is_leaf(page) && !page_has_prev(page) && index->is_primary() && index->table->instant && rec == page_rec_get_next_const(page_get_infimum_rec(page)); if (is_alter_metadata && !rec_is_alter_metadata(rec, page_is_comp(page))) { btr_index_rec_validate_report(page, rec, index); ib::error() << "First record is not ALTER TABLE metadata"; return FALSE; } if (!page_is_comp(page)) { const ulint n_rec_fields = rec_get_n_fields_old(rec); if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD && index->id == DICT_INDEXES_ID) { /* A record for older SYS_INDEXES table (missing merge_threshold column) is acceptable. */ } else if (is_alter_metadata) { if (n_rec_fields != ulint(index->n_fields) + 1) { goto n_field_mismatch; } } else if (n_rec_fields < index->n_core_fields || n_rec_fields > index->n_fields) { n_field_mismatch: btr_index_rec_validate_report(page, rec, index); ib::error() << "Has " << rec_get_n_fields_old(rec) << " fields, should have " << index->n_core_fields << ".." << index->n_fields; if (dump_on_error) { fputs("InnoDB: corrupt record ", stderr); rec_print_old(stderr, rec); putc('\n', stderr); } return(FALSE); } } offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); const dict_field_t* field = index->fields; ut_ad(rec_offs_n_fields(offsets) == ulint(index->n_fields) + is_alter_metadata); for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { rec_get_nth_field_offs(offsets, i, &len); ulint fixed_size; if (is_alter_metadata && i == index->first_user_field()) { fixed_size = FIELD_REF_SIZE; if (len != FIELD_REF_SIZE || !rec_offs_nth_extern(offsets, i)) { goto len_mismatch; } continue; } else { fixed_size = dict_col_get_fixed_size( field->col, page_is_comp(page)); if (rec_offs_nth_extern(offsets, i)) { const byte* data = rec_get_nth_field( rec, offsets, i, &len); len -= BTR_EXTERN_FIELD_REF_SIZE; ulint extern_len = mach_read_from_4( data + len + BTR_EXTERN_LEN + 4); if (fixed_size == extern_len + len) { goto next_field; } } } /* Note that if fixed_size != 0, it equals the length of a fixed-size column in the clustered index. We should adjust it here. A prefix index of the column is of fixed, but different length. When fixed_size == 0, prefix_len is the maximum length of the prefix index column. */ if (len_is_stored(len) && (field->prefix_len ? 
len > field->prefix_len : (fixed_size && len != fixed_size))) { len_mismatch: btr_index_rec_validate_report(page, rec, index); ib::error error; error << "Field " << i << " len is " << len << ", should be " << fixed_size; if (dump_on_error) { error << "; "; rec_print(error.m_oss, rec, rec_get_info_bits( rec, rec_offs_comp(offsets)), offsets); } if (heap) { mem_heap_free(heap); } return(FALSE); } next_field: field++; } #ifdef VIRTUAL_INDEX_DEBUG if (dict_index_has_virtual(index)) { rec_print_new(stderr, rec, offsets); } #endif if (heap) { mem_heap_free(heap); } return(TRUE); } /************************************************************//** Checks the size and number of fields in records based on the definition of the index. @return true if ok */ static bool btr_index_page_validate( /*====================*/ buf_block_t* block, /*!< in: index page */ dict_index_t* index) /*!< in: index */ { page_cur_t cur; #ifndef DBUG_OFF ulint nth = 1; #endif /* !DBUG_OFF */ page_cur_set_before_first(block, &cur); /* Directory slot 0 should only contain the infimum record. */ DBUG_EXECUTE_IF("check_table_rec_next", ut_a(page_rec_get_nth_const( page_cur_get_page(&cur), 0) == cur.rec); ut_a(page_dir_slot_get_n_owned( page_dir_get_nth_slot( page_cur_get_page(&cur), 0)) == 1);); while (page_cur_move_to_next(&cur)) { if (page_cur_is_after_last(&cur)) { return true; } if (!btr_index_rec_validate(cur.rec, index, TRUE)) { break; } /* Verify that page_rec_get_nth_const() is correctly retrieving each record. */ DBUG_EXECUTE_IF("check_table_rec_next", ut_a(cur.rec == page_rec_get_nth_const( page_cur_get_page(&cur), page_rec_get_n_recs_before( cur.rec))); ut_a(nth++ == page_rec_get_n_recs_before( cur.rec));); } return false; } /************************************************************//** Report an error on one page of an index tree. */ static void btr_validate_report1( /*=================*/ dict_index_t* index, /*!< in: index */ ulint level, /*!< in: B-tree level */ const buf_block_t* block) /*!< in: index page */ { ib::error error; error << "In page " << block->page.id().page_no() << " of index " << index->name << " of table " << index->table->name; if (level > 0) { error << ", index tree level " << level; } } /************************************************************//** Report an error on two pages of an index tree. */ static void btr_validate_report2( /*=================*/ const dict_index_t* index, /*!< in: index */ ulint level, /*!< in: B-tree level */ const buf_block_t* block1, /*!< in: first index page */ const buf_block_t* block2) /*!< in: second index page */ { ib::error error; error << "In pages " << block1->page.id() << " and " << block2->page.id() << " of index " << index->name << " of table " << index->table->name; if (level) error << ", index tree level " << level; } /** Validate an index tree level. 
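The level is reached by descending from the root along the first node pointer
of each page; for an R-tree the descent additionally follows FIL_PAGE_PREV
links to the leftmost page of the level. The pages of the level are then
validated from left to right.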
*/
static
dberr_t
btr_validate_level(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	const trx_t*	trx,	/*!< in: transaction or NULL */
	ulint		level)	/*!< in: level number */
{
	buf_block_t*	block;
	page_t*		page;
	buf_block_t*	right_block = 0; /* remove warning */
	page_t*		right_page = 0; /* remove warning */
	page_t*		father_page;
	btr_cur_t	node_cur;
	btr_cur_t	right_node_cur;
	rec_t*		rec;
	page_cur_t	cursor;
	dtuple_t*	node_ptr_tuple;
	mtr_t		mtr;
	mem_heap_t*	heap	= mem_heap_create(256);
	rec_offs*	offsets	= NULL;
	rec_offs*	offsets2 = NULL;
#ifdef UNIV_ZIP_DEBUG
	page_zip_des_t*	page_zip;
#endif /* UNIV_ZIP_DEBUG */

	mtr.start();

	mtr_x_lock_index(index, &mtr);

	dberr_t err;
	block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err);
	if (!block) {
		mtr.commit();
		return err;
	}
	page = buf_block_get_frame(block);

	fil_space_t* space = index->table->space;

	while (level != btr_page_get_level(page)) {
		const rec_t*	node_ptr;
		switch (dberr_t e =
			fseg_page_is_allocated(space,
					       block->page.id().page_no())) {
		case DB_SUCCESS_LOCKED_REC:
			break;
		case DB_SUCCESS:
			btr_validate_report1(index, level, block);
			ib::warn() << "Page is free";
			e = DB_CORRUPTION;
			/* fall through */
		default:
			err = e;
		}
		ut_ad(index->table->space_id == block->page.id().space());
		ut_ad(block->page.id().space() == page_get_space_id(page));
#ifdef UNIV_ZIP_DEBUG
		page_zip = buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
		if (page_is_leaf(page)) {
corrupted:
			err = DB_CORRUPTION;
			goto invalid_page;
		}

		page_cur_set_before_first(block, &cursor);
		if (!(node_ptr = page_cur_move_to_next(&cursor))) {
			goto corrupted;
		}

		offsets = rec_get_offsets(node_ptr, index, offsets, 0,
					  ULINT_UNDEFINED, &heap);

		block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr,
					       &err);
		if (!block) {
			break;
		}
		page = buf_block_get_frame(block);

		/* For an R-tree, since the record order might not match the
		linked order of the index pages in the lower level, we need
		to traverse backwards to reach the first (leftmost) page on
		this level. This is only used for index validation. A spatial
		index does not use such a scan for any of its DML or query
		operations. */
		if (dict_index_is_spatial(index)) {
			uint32_t left_page_no = btr_page_get_prev(page);

			while (left_page_no != FIL_NULL) {
				/* To obey the latch order of tree blocks,
				we release the right_block once, in order
				to obtain the latch on the uncle block. */
				mtr.release_last_page();

				block = btr_block_get(*index, left_page_no,
						      RW_SX_LATCH, false,
						      &mtr, &err);
				if (!block) {
					goto invalid_page;
				}
				page = buf_block_get_frame(block);
				left_page_no = btr_page_get_prev(page);
			}
		}
	}

	/* Now we are on the desired level. Loop through the pages on that
	level.
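	Each iteration checks one page (allocation status, index id,
	page_validate(), page level, record contents on the leaf level,
	sibling links, and the father node pointers) and then follows
	FIL_PAGE_NEXT to the right sibling until the end of the level is
	reached or the transaction is interrupted.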
*/ loop: if (!block) { invalid_page: mtr.commit(); func_exit: mem_heap_free(heap); return err; } mem_heap_empty(heap); offsets = offsets2 = NULL; mtr_x_lock_index(index, &mtr); page = block->page.frame; #ifdef UNIV_ZIP_DEBUG page_zip = buf_block_get_page_zip(block); ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ if (DB_SUCCESS_LOCKED_REC != fseg_page_is_allocated(space, block->page.id().page_no())) { btr_validate_report1(index, level, block); ib::warn() << "Page is marked as free"; err = DB_CORRUPTION; } else if (btr_page_get_index_id(page) != index->id) { ib::error() << "Page index id " << btr_page_get_index_id(page) << " != data dictionary index id " << index->id; err = DB_CORRUPTION; } else if (!page_validate(page, index)) { btr_validate_report1(index, level, block); err = DB_CORRUPTION; } else if (btr_page_get_level(page) != level) { btr_validate_report1(index, level, block); ib::error() << "Page level is not " << level; err = DB_CORRUPTION; } else if (level == 0 && !btr_index_page_validate(block, index)) { /* We are on level 0. Check that the records have the right number of fields, and field lengths are right. */ err = DB_CORRUPTION; } else if (!page_is_empty(page)) { } else if (level) { btr_validate_report1(index, level, block); ib::error() << "Non-leaf page is empty"; } else if (block->page.id().page_no() != index->page) { btr_validate_report1(index, level, block); ib::error() << "Empty leaf page is not index root"; } uint32_t right_page_no = btr_page_get_next(page); uint32_t left_page_no = btr_page_get_prev(page); if (right_page_no != FIL_NULL) { const rec_t* right_rec; right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, !level, &mtr, &err); if (!right_block) { btr_validate_report1(index, level, block); fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr); goto invalid_page; } right_page = buf_block_get_frame(right_block); if (btr_page_get_prev(right_page) != page_get_page_no(page)) { btr_validate_report2(index, level, block, right_block); fputs("InnoDB: broken FIL_PAGE_NEXT" " or FIL_PAGE_PREV links\n", stderr); err = DB_CORRUPTION; } if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) { broken_links: btr_validate_report1(index, level, block); fputs("InnoDB: broken record links\n", stderr); goto invalid_page; } if (!(right_rec = page_rec_get_next(page_get_infimum_rec(right_page)))) { goto broken_links; } offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); offsets2 = rec_get_offsets(right_rec, index, offsets2, page_is_leaf(right_page) ? index->n_core_fields : 0, ULINT_UNDEFINED, &heap); /* For spatial index, we cannot guarantee the key ordering across pages, so skip the record compare verification for now. 
		This will be enhanced in a dedicated R-tree index validation
		scheme. */

		if (index->is_btree() && cmp_rec_rec(rec, right_rec,
						     offsets, offsets2,
						     index) >= 0) {

			btr_validate_report2(index, level, block, right_block);

			fputs("InnoDB: records in wrong order"
			      " on adjacent pages\n", stderr);

			rec = page_rec_get_prev(page_get_supremum_rec(page));
			if (rec) {
				fputs("InnoDB: record ", stderr);
				rec_print(stderr, rec, index);
				putc('\n', stderr);
			}

			fputs("InnoDB: record ", stderr);
			rec = page_rec_get_next(
				page_get_infimum_rec(right_page));
			if (rec) {
				rec_print(stderr, rec, index);
			}
			putc('\n', stderr);

			err = DB_CORRUPTION;
		}
	}

	if (!level || left_page_no != FIL_NULL) {
	} else if (const rec_t* first
		   = page_rec_get_next_const(page_get_infimum_rec(page))) {
		if (!(REC_INFO_MIN_REC_FLAG
		      & rec_get_info_bits(first, page_is_comp(page)))) {
			btr_validate_report1(index, level, block);
			ib::error() << "Missing REC_INFO_MIN_REC_FLAG";
			err = DB_CORRUPTION;
		}
	} else {
		err = DB_CORRUPTION;
		goto node_ptr_fails;
	}

	/* Similarly skip the father node check for spatial indexes for now,
	for a couple of reasons:
	1) As mentioned, there is no ordering relationship between the
	records in the parent level and the linked pages in the child level.
	2) Searching for the parent from the root is very costly for an
	R-tree. We will add a special validation mechanism for R-trees
	later (WL #7520) */
	if (index->is_btree() && block->page.id().page_no() != index->page) {

		/* Check father node pointers */
		rec_t*	node_ptr
			= page_rec_get_next(page_get_infimum_rec(page));
		if (!node_ptr) {
			err = DB_CORRUPTION;
			goto node_ptr_fails;
		}

		btr_cur_position(index, node_ptr, block, &node_cur);
		offsets = btr_page_get_father_node_ptr_for_validate(
			offsets, heap, &node_cur, &mtr);

		father_page = btr_cur_get_page(&node_cur);
		node_ptr = btr_cur_get_rec(&node_cur);

		rec = page_rec_get_prev(page_get_supremum_rec(page));
		if (rec) {
			btr_cur_position(index, rec, block, &node_cur);

			offsets = btr_page_get_father_node_ptr_for_validate(
				offsets, heap, &node_cur, &mtr);
		} else {
			offsets = nullptr;
		}

		if (!offsets || node_ptr != btr_cur_get_rec(&node_cur)
		    || btr_node_ptr_get_child_page_no(node_ptr, offsets)
		    != block->page.id().page_no()) {

			btr_validate_report1(index, level, block);

			fputs("InnoDB: node pointer to the page is wrong\n",
			      stderr);

			fputs("InnoDB: node ptr ", stderr);
			rec_print(stderr, node_ptr, index);

			if (offsets) {
				rec = btr_cur_get_rec(&node_cur);
				fprintf(stderr, "\n"
					"InnoDB: node ptr child page n:o %u\n",
					btr_node_ptr_get_child_page_no(
						rec, offsets));

				fputs("InnoDB: record on page ", stderr);
				rec_print_new(stderr, rec, offsets);
				putc('\n', stderr);
			}

			err = DB_CORRUPTION;
			goto node_ptr_fails;
		}

		if (page_is_leaf(page)) {
		} else if (const rec_t* first_rec
			   = page_rec_get_next(page_get_infimum_rec(page))) {
			node_ptr_tuple = dict_index_build_node_ptr(
				index, first_rec, 0, heap,
				btr_page_get_level(page));

			if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, index,
					   offsets)) {
				btr_validate_report1(index, level, block);

				ib::error() << "Node ptrs differ on levels > 0";

				fputs("InnoDB: node ptr ", stderr);
				rec_print_new(stderr, node_ptr, offsets);
				fputs("InnoDB: first rec ", stderr);
				rec_print(stderr, first_rec, index);
				putc('\n', stderr);
				err = DB_CORRUPTION;
				goto node_ptr_fails;
			}
		} else {
			err = DB_CORRUPTION;
			goto node_ptr_fails;
		}

		if (left_page_no == FIL_NULL) {
			if (page_has_prev(father_page)
			    || node_ptr != page_rec_get_next(
				    page_get_infimum_rec(father_page))) {
				err = DB_CORRUPTION;
				goto node_ptr_fails;
			}
		}

		if (right_page_no == FIL_NULL) {
			if (page_has_next(father_page)
			    || node_ptr != page_rec_get_prev(
page_get_supremum_rec(father_page))) { err = DB_CORRUPTION; goto node_ptr_fails; } } else if (const rec_t* right_node_ptr = page_rec_get_next(node_ptr)) { btr_cur_position( index, page_get_infimum_rec(right_block->page.frame), right_block, &right_node_cur); if (!page_cur_move_to_next(&right_node_cur.page_cur)) { goto node_pointer_corrupted; } offsets = btr_page_get_father_node_ptr_for_validate( offsets, heap, &right_node_cur, &mtr); if (right_node_ptr != page_get_supremum_rec(father_page)) { if (btr_cur_get_rec(&right_node_cur) != right_node_ptr) { node_pointer_corrupted: err = DB_CORRUPTION; fputs("InnoDB: node pointer to" " the right page is wrong\n", stderr); btr_validate_report1(index, level, block); } } else { page_t* right_father_page = btr_cur_get_page(&right_node_cur); if (btr_cur_get_rec(&right_node_cur) != page_rec_get_next( page_get_infimum_rec( right_father_page))) { err = DB_CORRUPTION; fputs("InnoDB: node pointer 2 to" " the right page is wrong\n", stderr); btr_validate_report1(index, level, block); } if (page_get_page_no(right_father_page) != btr_page_get_next(father_page)) { err = DB_CORRUPTION; fputs("InnoDB: node pointer 3 to" " the right page is wrong\n", stderr); btr_validate_report1(index, level, block); } } } else { err = DB_CORRUPTION; } } node_ptr_fails: /* Commit the mini-transaction to release the latch on 'page'. Re-acquire the latch on right_page, which will become 'page' on the next loop. The page has already been checked. */ mtr.commit(); if (trx_is_interrupted(trx)) { /* On interrupt, return the current status. */ } else if (right_page_no != FIL_NULL) { mtr.start(); block = btr_block_get(*index, right_page_no, RW_SX_LATCH, !level, &mtr, &err); goto loop; } goto func_exit; } /**************************************************************//** Checks the consistency of an index tree. @return DB_SUCCESS if ok, error code if not */ dberr_t btr_validate_index( /*===============*/ dict_index_t* index, /*!< in: index */ const trx_t* trx) /*!< in: transaction or NULL */ { mtr_t mtr; mtr.start(); mtr_x_lock_index(index, &mtr); dberr_t err; if (page_t *root= btr_root_get(index, &mtr, &err)) for (auto level= btr_page_get_level(root);; level--) { if (dberr_t err_level= btr_validate_level(index, trx, level)) err= err_level; if (!level) break; } mtr.commit(); return err; } /**************************************************************//** Checks if the page in the cursor can be merged with given page. If necessary, re-organize the merge_page. @return true if possible to merge. 
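Merging is considered possible when all records of the cursor's page fit into
the sibling page, if necessary after reorganizing it, and when a
ROW_FORMAT=COMPRESSED leaf sibling would not exceed the compression padding
limit.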
*/
static
bool
btr_can_merge_with_page(
/*====================*/
	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
	uint32_t	page_no,	/*!< in: a sibling page */
	buf_block_t**	merge_block,	/*!< out: the merge block */
	mtr_t*		mtr)		/*!< in: mini-transaction */
{
	dict_index_t*	index;
	page_t*		page;
	ulint		n_recs;
	ulint		data_size;
	ulint		max_ins_size_reorg;
	ulint		max_ins_size;
	buf_block_t*	mblock;
	page_t*		mpage;
	DBUG_ENTER("btr_can_merge_with_page");

	if (page_no == FIL_NULL) {
error:
		*merge_block = NULL;
		DBUG_RETURN(false);
	}

	index = btr_cur_get_index(cursor);
	page = btr_cur_get_page(cursor);

	mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
			       mtr);
	if (!mblock) {
		goto error;
	}
	mpage = buf_block_get_frame(mblock);

	n_recs = page_get_n_recs(page);
	data_size = page_get_data_size(page);

	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
		mpage, n_recs);

	if (data_size > max_ins_size_reorg) {
		goto error;
	}

	/* If the compression padding indicates that the merge would produce
	an overly full page, which would likely cause a compression failure,
	then do not merge the pages. */
	if (mblock->page.zip.data && page_is_leaf(mpage)
	    && (page_get_data_size(mpage) + data_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
		goto error;
	}

	max_ins_size = page_get_max_insert_size(mpage, n_recs);

	if (data_size > max_ins_size) {
		/* We have to reorganize mpage */
		if (btr_page_reorganize_block(page_zip_level, mblock, index,
					      mtr) != DB_SUCCESS) {
			goto error;
		}

		max_ins_size = page_get_max_insert_size(mpage, n_recs);

		ut_ad(page_validate(mpage, index));
		ut_ad(max_ins_size == max_ins_size_reorg);

		if (data_size > max_ins_size) {
			/* Add fault tolerance, though this should
			never happen */
			goto error;
		}
	}

	*merge_block = mblock;

	DBUG_RETURN(true);
}