diff options
Diffstat (limited to 'storage/innobase/btr')
-rw-r--r-- | storage/innobase/btr/btr0btr.cc | 5192 | ||||
-rw-r--r-- | storage/innobase/btr/btr0bulk.cc | 1238 | ||||
-rw-r--r-- | storage/innobase/btr/btr0cur.cc | 8279 | ||||
-rw-r--r-- | storage/innobase/btr/btr0defragment.cc | 843 | ||||
-rw-r--r-- | storage/innobase/btr/btr0pcur.cc | 681 | ||||
-rw-r--r-- | storage/innobase/btr/btr0sea.cc | 2372 |
6 files changed, 18605 insertions, 0 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc new file mode 100644 index 00000000..de87ad02 --- /dev/null +++ b/storage/innobase/btr/btr0btr.cc @@ -0,0 +1,5192 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#include "page0page.h" +#include "page0zip.h" +#include "gis0rtree.h" + +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "btr0defragment.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "gis0geo.h" +#include "dict0boot.h" +#include "row0sel.h" /* row_search_max_autoinc() */ + +Atomic_counter<uint32_t> btr_validate_index_running; + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. */ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/** Report that an index page is corrupted. +@param[in] buffer block +@param[in] index tree */ +void btr_corruption_report(const buf_block_t* block, const dict_index_t* index) +{ + ib::fatal() + << "Flag mismatch in page " << block->page.id() + << " index " << index->name + << " of table " << index->table->name; +} + +/* +Latching strategy of the InnoDB B-tree +-------------------------------------- + +Node pointer page latches acquisition is protected by index->lock latch. + +Before MariaDB 10.2.2, all node pointer pages were protected by index->lock +either in S (shared) or X (exclusive) mode and block->lock was not acquired on +node pointer pages. + +After MariaDB 10.2.2, block->lock S-latch or X-latch is used to protect +node pointer pages and obtaiment of node pointer page latches is protected by +index->lock. + +(0) Definition: B-tree level. + +(0.1) The leaf pages of the B-tree are at level 0. + +(0.2) The parent of a page at level L has level L+1. (The level of the +root page is equal to the tree height.) + +(0.3) The B-tree lock (index->lock) is the parent of the root page and +has a level = tree height + 1. + +Index->lock has 3 possible locking modes: + +(1) S-latch: + +(1.1) All latches for pages must be obtained in descending order of tree level. + +(1.2) Before obtaining the first node pointer page latch at a given B-tree +level, parent latch must be held (at level +1 ). + +(1.3) If a node pointer page is already latched at the same level +we can only obtain latch to its right sibling page latch at the same level. + +(1.4) Release of the node pointer page latches must be done in +child-to-parent order. (Prevents deadlocks when obtained index->lock +in SX mode). + +(1.4.1) Level L node pointer page latch can be released only when +no latches at children level i.e. level < L are hold. + +(1.4.2) All latches from node pointer pages must be released so +that no latches are obtained between. + +(1.5) [implied by (1.1), (1.2)] Root page latch must be first node pointer +latch obtained. + +(2) SX-latch: + +In this case rules (1.2) and (1.3) from S-latch case are relaxed and +merged into (2.2) and rule (1.4) is removed. Thus, latch acquisition +can be skipped at some tree levels and latches can be obtained in +a less restricted order. + +(2.1) [identical to (1.1)]: All latches for pages must be obtained in descending +order of tree level. + +(2.2) When a node pointer latch at level L is obtained, +the left sibling page latch in the same level or some ancestor +page latch (at level > L) must be hold. + +(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained can +be any node pointer page. + +(3) X-latch: + +Node pointer latches can be obtained in any order. + +NOTE: New rules after MariaDB 10.2.2 does not affect the latching rules of leaf pages: + +index->lock S-latch is needed in read for the node pointer traversal. When the leaf +level is reached, index-lock can be released (and with the MariaDB 10.2.2 changes, all +node pointer latches). Left to right index travelsal in leaf page level can be safely done +by obtaining right sibling leaf page latch and then releasing the old page latch. + +Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock +S-latch. + +B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page +allocations are protected by index->lock X-latch. + +Node pointers +------------- +Leaf pages of a B-tree contain the index records stored in the +tree. On levels n > 0 we store 'node pointers' to pages on level +n - 1. For each page there is exactly one node pointer stored: +thus the our tree is an ordinary B-tree, not a B-link tree. + +A node pointer contains a prefix P of an index record. The prefix +is long enough so that it determines an index record uniquely. +The file page number of the child page is added as the last +field. To the child page we can store node pointers or index records +which are >= P in the alphabetical order, but < P1 if there is +a next node pointer on the level, and P1 is its prefix. + +If a node pointer with a prefix P points to a non-leaf child, +then the leftmost record in the child must have the same +prefix P. If it points to a leaf node, the child is not required +to contain any record with a prefix equal to P. The leaf case +is decided this way to allow arbitrary deletions in a leaf node +without touching upper levels of the tree. + +We have predefined a special minimum record which we +define as the smallest record in any alphabetical order. +A minimum record is denoted by setting a bit in the record +header. A minimum record acts as the prefix of a node pointer +which points to a leftmost node on any level of the tree. + +File page allocation +-------------------- +In the root node of a B-tree there are two file segment headers. +The leaf pages of a tree are allocated from one file segment, to +make them consecutive on disk if possible. From the other file segment +we allocate pages for the non-leaf levels of the tree. +*/ + +#ifdef UNIV_BTR_DEBUG +/**************************************************************//** +Checks a file segment header within a B-tree root page. +@return TRUE if valid */ +static +ibool +btr_root_fseg_validate( +/*===================*/ + const fseg_header_t* seg_header, /*!< in: segment header */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); + ut_a(offset >= FIL_PAGE_DATA); + ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END); + return(TRUE); +} +#endif /* UNIV_BTR_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + const dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (!index->table || !index->table->space) { + return NULL; + } + + buf_block_t* block = btr_block_get(*index, index->page, mode, false, + mtr); + + if (!block) { + index->table->file_unreadable = true; + + ib_push_warning( + static_cast<THD*>(NULL), DB_DECRYPTION_FAILED, + "Table %s in file %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name, + UT_LIST_GET_FIRST(index->table->space->chain)->name); + + return NULL; + } + + btr_assert_not_corrupted(block, index); + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, index->table->space_id)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, index->table->space_id)); + } +#endif /* UNIV_BTR_DEBUG */ + + return(block); +} + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. +@return root page, sx-latched */ +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* Intended to be used for segment list access. + SX lock doesn't block reading user data by other threads. + And block the segment list access by others.*/ + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + return(root ? buf_block_get_frame(root) : NULL); +} + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +ulint +btr_height_get( +/*===========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint height=0; + buf_block_t* root_block; + + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* S latches the page */ + root_block = btr_root_block_get(index, RW_S_LATCH, mtr); + + if (root_block) { + height = btr_page_get_level(buf_block_get_frame(root_block)); + + /* Release the S latch on the root page. */ + mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX); + + ut_d(sync_check_unlock(&root_block->lock)); + } + + return(height); +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > srv_page_size - FIL_PAGE_DATA_END) { + return false; + } + + seg_header += FSEG_HDR_SPACE; + + mach_write_to_4(seg_header, space); + if (UNIV_LIKELY_NULL(page_zip)) { + memcpy(page_zip->data + page_offset(seg_header), seg_header, + 4); + } + + return true; +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get_gen( + page_id_t(table->space->id, index->page), + table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, + &mtr, &err); + if (!block) { + ut_ad(err != DB_SUCCESS); + goto func_exit; + } + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!fil_page_index_page_check(page) || page_has_siblings(page)) { + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + /* Check that the table flags and the tablespace + flags match. */ + ulint tf = dict_tf_to_fsp_flags(table->flags); + ulint sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mutex_enter(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mutex_exit(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. */ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, table->space_id) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, table->space_id))) { + + err = DB_CORRUPTION; + } + +func_exit: + mtr_commit(&mtr); + + return(err); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). */ +void +btr_page_create( +/*============*/ + buf_block_t* block, /*!< in/out: page to be created */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the B-tree level of the page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + + block->frame); + + if (UNIV_LIKELY_NULL(page_zip)) + { + mach_write_to_8(index_id, index->id); + page_create_zip(block, index, level, 0, mtr); + } + else + { + page_create(block, mtr, dict_table_is_comp(index->table)); + if (index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>(*block, + my_assume_aligned<2>(PAGE_HEADER + + PAGE_LEVEL + + block->frame), level); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id); + } +} + +/**************************************************************//** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! +@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* new_block; + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + + fil_addr_t node_addr = flst_get_first(PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST + + root->frame); + ut_a(node_addr.page != FIL_NULL); + + new_block = buf_page_get( + page_id_t(index->table->space_id, node_addr.page), + index->table->space->zip_size(), + RW_X_LATCH, mtr); + + buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW); + + flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, + mtr); + ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + + return(new_block); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. */ +{ + page_t* root = btr_root_get(index, mtr); + + fseg_header_t* seg_header = (level + ? PAGE_HEADER + PAGE_BTR_SEG_TOP + : PAGE_HEADER + PAGE_BTR_SEG_LEAF) + + root; + + /* Parameter TRUE below states that the caller has made the + reservation for free extents, and thus we know that a page can + be allocated: */ + + buf_block_t* block = fseg_alloc_free_page_general( + seg_header, hint_page_no, file_direction, + true, mtr, init_mtr); + + return block; +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ +{ + buf_block_t* new_block; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + new_block = btr_page_alloc_low( + index, hint_page_no, file_direction, level, mtr, init_mtr); + + if (new_block) { + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + } + + return(new_block); +} + +/**************************************************************//** +Gets the number of pages in a B-tree. +@return number of pages, or ULINT_UNDEFINED if the index is unavailable */ +ulint +btr_get_size( +/*=========*/ + const dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint n=0; + + ut_ad(srv_read_only_mode + || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK)); + ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + if (!root) { + return ULINT_UNDEFINED; + } + mtr_x_lock_space(index->table->space, mtr); + if (flag == BTR_N_LEAF_PAGES) { + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, &n, mtr); + } else { + ulint dummy; + n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->frame, &dummy, mtr); + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, &dummy, mtr); + } + + return(n); +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint dummy; + + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + *used = 0; + if (!root) { + return ULINT_UNDEFINED; + } + + mtr_x_lock_space(index->table->space, mtr); + + ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->frame, used, mtr); + if (flag == BTR_TOTAL_SIZE) { + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->frame, &dummy, mtr); + *used += dummy; + } + + return(n); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. */ +static +void +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr); + + flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + + ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); +} + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page */ +void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef BTR_CUR_HASH_ADAPT + if (block->index && !block->index->freed()) { + ut_ad(!blob); + ut_ad(page_is_leaf(block->frame)); + } +#endif + const page_id_t id(block->page.id()); + ut_ad(index->table->space_id == id.space()); + /* The root page is freed by btr_free_root(). */ + ut_ad(id.page_no() != index->page); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + + buf_block_modify_clock_inc(block); + + if (dict_index_is_ibuf(index)) { + btr_page_free_for_ibuf(index, block, mtr); + return; + } + + /* TODO: Discard any operations for block from mtr->log. + The page will be freed, so previous changes to it by this + mini-transaction should not matter. */ + page_t* root = btr_root_get(index, mtr); + fseg_header_t* seg_header = &root[blob || page_is_leaf(block->frame) + ? PAGE_HEADER + PAGE_BTR_SEG_LEAF + : PAGE_HEADER + PAGE_BTR_SEG_TOP]; + fil_space_t* space= index->table->space; + const uint32_t page= id.page_no(); + + fseg_free_page(seg_header, space, page, mtr); + buf_page_free(space, page, mtr, __FILE__, __LINE__); + + /* The page was marked free in the allocation bitmap, but it + should remain exclusively latched until mtr_t::commit() or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +} + +/** Set the child page number in a node pointer record. +@param[in,out] block non-leaf index page +@param[in,out] rec node pointer record in the page +@param[in] offsets rec_get_offsets(rec) +@param[in] page_no child page number +@param[in,out] mtr mini-transaction +Sets the child node file address in a node pointer. */ +inline void btr_node_ptr_set_child_page_no(buf_block_t *block, + rec_t *rec, const rec_offs *offsets, + ulint page_no, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_rec_is_leaf(rec)); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + const ulint offs= rec_offs_data_size(offsets); + ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == + REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + page_zip_write_node_ptr(block, rec, offs, page_no, mtr); + else + mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); +} + +/************************************************************//** +Returns the child page of a node pointer and sx-latches it. +@return child page, sx-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), + RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, + mtr); +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an sx-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_node_ptr_func( +/*==============================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + ulint latch_mode,/*!< in: BTR_CONT_MODIFY_TREE + or BTR_CONT_SEARCH_TREE */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + rec_t* user_rec; + rec_t* node_ptr; + ulint level; + ulint page_no; + dict_index_t* index; + + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + + page_no = btr_cur_get_block(cursor)->page.id().page_no(); + index = btr_cur_get_index(cursor); + ut_ad(!dict_index_is_spatial(index)); + + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor)); + + user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level); + dberr_t err = DB_SUCCESS; + + err = btr_cur_search_to_nth_level( + index, level + 1, tuple, + PAGE_CUR_LE, latch_mode, cursor, 0, + file, line, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_page_get_father_node_ptr_func " + << " level: " << level + 1 + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name(); + } + + node_ptr = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + rec_t* print_rec; + + ib::error() + << "Corruption of an index tree: table " + << index->table->name + << " index " << index->name + << ", father ptr page no " + << btr_node_ptr_get_child_page_no(node_ptr, offsets) + << ", child page no " << page_no; + + print_rec = page_rec_get_next( + page_get_infimum_rec(page_align(user_rec))); + offsets = rec_get_offsets(print_rec, index, offsets, + page_rec_is_leaf(user_rec) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); + + ib::fatal() + << "You should dump + drop + reimport the table to" + << " fix the corruption. If the crash happens at" + << " database startup. " << FORCE_RECOVERY_MSG + << " Then dump + drop + reimport."; + } + + return(offsets); +} + +#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_MODIFY_TREE,__FILE__,__LINE__,mtr) + +#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr) \ + btr_page_get_father_node_ptr_func( \ + of,heap,cur,BTR_CONT_SEARCH_TREE,__FILE__,__LINE__,mtr) + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + dict_index_t* index, /*!< in: b-tree index */ + buf_block_t* block, /*!< in: child page in the index */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr)); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] index b-tree +@param[in] block child page +@param[in,out] mtr mini-transaction +@param[out] cursor cursor pointing to the x-latched parent page */ +void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + btr_cur_t* cursor) +{ + mem_heap_t* heap; + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + + heap = mem_heap_create(100); + btr_page_get_father_node_ptr(NULL, heap, cursor, mtr); + mem_heap_free(heap); +} + +#ifdef UNIV_DEBUG +/** PAGE_INDEX_ID value for freed index B-trees */ +constexpr index_id_t BTR_FREED_INDEX_ID = 0; +#endif + +/** Free a B-tree root page. btr_free_but_not_root() must already +have been called. +In a persistent tablespace, the caller must invoke fsp_init_file_page() +before mtr.commit(). +@param[in,out] block index root page +@param[in,out] mtr mini-transaction */ +static void btr_free_root(buf_block_t *block, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->is_named_space(block->page.id().space())); + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, + block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + /* Free the entire segment in small steps. */ + while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr)); +} + +/** Prepare to free a B-tree. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction +@return root block, to invoke btr_free_but_not_root() and btr_free_root() +@retval NULL if the page is no longer a matching B-tree page */ +static MY_ATTRIBUTE((warn_unused_result)) +buf_block_t* +btr_free_root_check( + const page_id_t page_id, + ulint zip_size, + index_id_t index_id, + mtr_t* mtr) +{ + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + buf_block_t* block = buf_page_get( + page_id, zip_size, RW_X_LATCH, mtr); + + if (block) { + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + if (fil_page_index_page_check(block->frame) + && index_id == btr_page_get_index_id(block->frame)) { + /* This should be a root page. + It should not be possible to reassign the same + index_id for some other index in the tablespace. */ + ut_ad(!page_has_siblings(block->frame)); + } else { + block = NULL; + } + } + + return(block); +} + +/** Create the root node for a new index tree. +@param[in] type type of the index +@param[in] index_id index id +@param[in,out] space tablespace where created +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr) +{ + buf_block_t* block; + + ut_ad(mtr->is_named_space(space)); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (UNIV_UNLIKELY(type & DICT_IBUF)) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr); + + if (ibuf_hdr_block == NULL) { + return(FIL_NULL); + } + + buf_block_dbg_add_level( + ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW); + + ut_ad(ibuf_hdr_block->page.id().page_no() + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, mtr); + + if (block == NULL) { + return(FIL_NULL); + } + + ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, + mtr); + + if (block == NULL) { + return(FIL_NULL); + } + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, + false, block)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(block, mtr); + return(FIL_NULL); + } + + /* The fseg create acquires a second latch on the page, + therefore we must declare it: */ + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + } + + ut_ad(!page_has_siblings(block->frame)); + + constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID; + + byte* page_index_id = my_assume_aligned<2>(field + block->frame); + + /* Create a new index page on the allocated segment page */ + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_8(page_index_id, index_id); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); + } else { + page_create(block, mtr, + index && index->table->not_redundant()); + if (index && index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + + block->frame, 0U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, + index_id); + } + + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. + + Note: Insert Buffering is disabled for temporary tables given that + most temporary tables are smaller in size and short-lived. */ + if (!(type & DICT_CLUSTERED) + && (!index || !index->table->is_temporary())) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(block->frame, 2) + > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(block->page.id().page_no()); +} + +/** Free a B-tree except the root page. The root page MUST be freed after +this by calling btr_free_root. +@param[in,out] block root page +@param[in] log_mode mtr logging mode */ +static +void +btr_free_but_not_root( + buf_block_t* block, + mtr_log_t log_mode) +{ + mtr_t mtr; + + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(!page_has_siblings(block->frame)); +leaf_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + mtr.set_named_space_id(block->page.id().space()); + + page_t* root = block->frame; + + if (!root) { + mtr_commit(&mtr); + return; + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, block->page.id().space())); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + bool finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF, + &mtr); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + mtr.set_named_space_id(block->page.id().space()); + + root = block->frame; + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, block->page.id().space())); +#endif /* UNIV_BTR_DEBUG */ + + finished = fseg_free_step_not_header( + root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr); + mtr_commit(&mtr); + + if (!finished) { + goto top_loop; + } +} + +/** Free a persistent index tree if it exists. +@param[in] page_id root page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction */ +void +btr_free_if_exists( + const page_id_t page_id, + ulint zip_size, + index_id_t index_id, + mtr_t* mtr) +{ + buf_block_t* root = btr_free_root_check( + page_id, zip_size, index_id, mtr); + + if (root == NULL) { + return; + } + + btr_free_but_not_root(root, mtr->get_log_mode()); + mtr->set_named_space_id(page_id.space()); + btr_free_root(root, mtr); +} + +/** Free an index tree in a temporary tablespace. +@param[in] page_id root page id */ +void btr_free(const page_id_t page_id) +{ + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); + + if (block) { + btr_free_but_not_root(block, MTR_LOG_NO_REDO); + btr_free_root(block, &mtr); + } + mtr.commit(); +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC. +@param[in,out] index clustered index +@return the last used AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc(dict_index_t* index) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + mtr_t mtr; + mtr.start(); + ib_uint64_t autoinc; + if (buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr)) { + autoinc = page_get_autoinc(block->frame); + } else { + autoinc = 0; + } + mtr.commit(); + return autoinc; +} + +/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, +or fall back to MAX(auto_increment_column). +@param[in] table table containing an AUTO_INCREMENT column +@param[in] col_no index of the AUTO_INCREMENT column +@return the AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +ib_uint64_t +btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) +{ + ut_ad(table->persistent_autoinc); + ut_ad(!table->is_temporary()); + + dict_index_t* index = dict_table_get_first_index(table); + + if (index == NULL) { + return 0; + } + + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space_id, index->page), + index->table->space->zip_size(), + RW_S_LATCH, &mtr); + + ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0; + const bool retry = block && autoinc == 0 + && !page_is_empty(block->frame); + mtr.commit(); + + if (retry) { + /* This should be an old data file where + PAGE_ROOT_AUTO_INC was initialized to 0. + Fall back to reading MAX(autoinc_col). + There should be an index on it. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(table, col_no); + while (index && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); + } + + if (index) { + autoinc = row_search_max_autoinc(index); + } + } + + return autoinc; +} + +/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. +@param[in,out] index clustered index +@param[in] autoinc the AUTO_INCREMENT value +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) +{ + ut_ad(index->is_primary()); + ut_ad(index->table->persistent_autoinc); + ut_ad(!index->table->is_temporary()); + + mtr_t mtr; + mtr.start(); + fil_space_t* space = index->table->space; + mtr.set_named_space(space); + page_set_autoinc(buf_page_get(page_id_t(space->id, index->page), + space->zip_size(), + RW_SX_LATCH, &mtr), + autoinc, &mtr, reset); + mtr.commit(); +} + +/** Reorganize an index page. +@param cursor index page cursor +@param index the index that the cursor belongs to +@param mtr mini-transaction */ +static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index, + mtr_t *mtr) +{ + const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO); + + buf_block_t *const block= cursor->block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!is_buf_block_get_page_zip(block)); + btr_assert_not_corrupted(block, index); + ut_ad(fil_page_index_page_check(block->frame)); + ut_ad(index->is_dummy || + block->page.id().space() == index->table->space->id); + ut_ad(index->is_dummy || block->page.id().page_no() != index->page || + !page_has_siblings(block->frame)); + + buf_block_t *old= buf_block_alloc(); + /* Copy the old page to temporary space */ + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->frame, block->frame, srv_page_size); + + btr_search_drop_page_hash_index(block); + + /* Save the cursor position. */ + const ulint pos= page_rec_get_n_recs_before(cursor->rec); + + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) + block->frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE); + + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->frame), + index, mtr); + + /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ + ut_ad(!page_get_max_trx_id(block->frame)); + memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame, + PAGE_MAX_TRX_ID + PAGE_HEADER + old->frame, 8); +#ifdef UNIV_DEBUG + if (page_get_max_trx_id(block->frame)) + /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than + clustered index root pages. */ + ut_ad(dict_index_is_sec_or_ibuf(index) + ? page_is_leaf(block->frame) + : block->page.id().page_no() == index->page); + else + /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than + the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf + pages, and in temporary tables. It was always zero-initialized in + page_create(). PAGE_MAX_TRX_ID must be nonzero on + dict_index_is_sec_or_ibuf() leaf pages. */ + ut_ad(index->table->is_temporary() || !page_is_leaf(block->frame) || + !dict_index_is_sec_or_ibuf(index)); +#endif + + const uint16_t data_size1= page_get_data_size(old->frame); + const uint16_t data_size2= page_get_data_size(block->frame); + const ulint max1= page_get_max_insert_size_after_reorganize(old->frame, 1); + const ulint max2= page_get_max_insert_size_after_reorganize(block->frame, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) + ib::fatal() << "Page old data size " << data_size1 + << " new data size " << data_size2 + << ", page old max ins size " << max1 + << " new max ins size " << max2; + + /* Restore the cursor position. */ + if (pos) + cursor->rec = page_rec_get_nth(block->frame, pos); + else + ut_ad(cursor->rec == page_get_infimum_rec(block->frame)); + + if (block->page.id().page_no() == index->page && + fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT) + { + /* Preserve the PAGE_INSTANT information. */ + ut_ad(index->is_instant()); + memcpy_aligned<2>(FIL_PAGE_TYPE + block->frame, + FIL_PAGE_TYPE + old->frame, 2); + memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->frame, + PAGE_HEADER + PAGE_INSTANT + old->frame, 2); + if (!index->table->instant); + else if (page_is_comp(block->frame)) + { + memcpy(PAGE_NEW_INFIMUM + block->frame, + PAGE_NEW_INFIMUM + old->frame, 8); + memcpy(PAGE_NEW_SUPREMUM + block->frame, + PAGE_NEW_SUPREMUM + old->frame, 8); + } + else + { + memcpy(PAGE_OLD_INFIMUM + block->frame, + PAGE_OLD_INFIMUM + old->frame, 8); + memcpy(PAGE_OLD_SUPREMUM + block->frame, + PAGE_OLD_SUPREMUM + old->frame, 8); + } + } + + ut_ad(!memcmp(old->frame, block->frame, PAGE_HEADER)); + ut_ad(!memcmp(old->frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + block->frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); + + if (!dict_table_is_locking_disabled(index->table)) + lock_move_reorganize_page(block, old); + + /* Write log for the changes, if needed. */ + mtr->set_log_mode(log_mode); + if (log_mode == MTR_LOG_ALL) + { + /* Check and log the changes in the page header. */ + ulint a, e; + for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page header fields. */ + mtr->memcpy(*block, a, e - a); + break; + } + + const uint16_t top= page_header_get_offs(block->frame, PAGE_HEAP_TOP); + + if (page_is_comp(block->frame)) + { + /* info_bits=0, n_owned=1, heap_no=0, status */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->frame, + PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->frame, 3)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_NEW_INFIMUM - 2; + e= a + 2; + if (block->frame[a] == old->frame[a]) + a++; + if (--e, block->frame[e] != old->frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->frame, + PAGE_NEW_INFIMUM + old->frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; + if (block->frame[a] != old->frame[a]) + mtr->memcpy(*block, a, 1); + /* The rest of the supremum record must not change. */ + ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1], + PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + + REC_N_NEW_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + else + { + /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->frame, + PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->frame, 4)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_OLD_INFIMUM - 2; + e= a + 2; + if (block->frame[a] == old->frame[a]) + a++; + if (--e, block->frame[e] != old->frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->frame, + PAGE_OLD_INFIMUM + old->frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; + if (block->frame[a] != old->frame[a]) + mtr->memcpy(*block, a, 1); + ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1], + PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + + REC_N_OLD_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + e= srv_page_size - PAGE_DIR; + a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->frame); + + /* Zero out the payload area. */ + mtr->memset(*block, top, a - top, 0); + + /* Log changes to the page directory. */ + for (; a < e; a++) + { + if (old->frame[a] == block->frame[a]) + continue; + while (--e, old->frame[e] == block->frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page directory slots. */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + buf_block_free(old); + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool +btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (buf_block_get_page_zip(block)) { + return page_zip_reorganize(block, index, z_level, mtr, true); + } + + page_cur_t cur; + page_cur_set_before_first(block, &cur); + + btr_page_reorganize_low(&cur, index, mtr); + return true; +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (!buf_block_get_page_zip(cursor->block)) { + btr_page_reorganize_low(cursor, index, mtr); + return true; + } + + ulint pos = page_rec_get_n_recs_before(cursor->rec); + if (!page_zip_reorganize(cursor->block, index, page_zip_level, mtr, + true)) { + return false; + } + if (pos) { + cursor->rec = page_rec_get_nth(cursor->block->frame, pos); + } else { + ut_ad(cursor->rec == page_get_infimum_rec( + cursor->block->frame)); + } + + return true; +} + +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); + ut_ad(!index->is_dummy); + ut_ad(index->table->space->id == block->page.id().space()); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index + root page. */ + const ib_uint64_t autoinc + = dict_index_is_clust(index) + && index->page == block->page.id().page_no() + ? page_get_autoinc(block->frame) + : 0; + + if (page_zip) { + page_create_zip(block, index, level, autoinc, mtr); + } else { + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL + + block->frame, level); + if (autoinc) { + mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID + + block->frame, autoinc); + } + } +} + +/** Write instant ALTER TABLE metadata to a root page. +@param[in,out] root clustered index root page +@param[in] index clustered index with instant ALTER TABLE +@param[in,out] mtr mini-transaction */ +void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) +{ + ut_ad(index.n_core_fields > 0); + ut_ad(index.n_core_fields < REC_MAX_N_FIELDS); + ut_ad(index.is_instant()); + ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT + || fil_page_get_type(root->frame) == FIL_PAGE_INDEX); + ut_ad(!page_has_siblings(root->frame)); + ut_ad(root->page.id().page_no() == index.page); + + rec_t* infimum = page_get_infimum_rec(root->frame); + rec_t* supremum = page_get_supremum_rec(root->frame); + byte* page_type = root->frame + FIL_PAGE_TYPE; + uint16_t i = page_header_get_field(root->frame, PAGE_INSTANT); + + switch (mach_read_from_2(page_type)) { + case FIL_PAGE_TYPE_INSTANT: + ut_ad(page_get_instant(root->frame) == index.n_core_fields); + if (memcmp(infimum, "infimum", 8) + || memcmp(supremum, "supremum", 8)) { + ut_ad(index.table->instant); + ut_ad(!memcmp(infimum, field_ref_zero, 8)); + ut_ad(!memcmp(supremum, field_ref_zero, 7)); + /* The n_core_null_bytes only matters for + ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */ + ut_ad(supremum[7] == index.n_core_null_bytes + || !index.table->not_redundant()); + return; + } + break; + default: + ut_ad("wrong page type" == 0); + /* fall through */ + case FIL_PAGE_INDEX: + ut_ad(!page_is_comp(root->frame) + || !page_get_instant(root->frame)); + ut_ad(!memcmp(infimum, "infimum", 8)); + ut_ad(!memcmp(supremum, "supremum", 8)); + mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT); + ut_ad(i <= PAGE_NO_DIRECTION); + i |= static_cast<uint16_t>(index.n_core_fields << 3); + mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->frame, + i); + break; + } + + if (index.table->instant) { + mtr->memset(root, infimum - root->frame, 8, 0); + mtr->memset(root, supremum - root->frame, 7, 0); + mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7], + index.n_core_null_bytes); + } +} + +/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE. +@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr)) + { + byte *page_type= root->frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->frame[supremum], supremuminfimum, 8)); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[supremum], + supremuminfimum, 8); + } +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. +@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + ulint new_page_no; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root; + buf_block_t* new_block; + + root = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root); + ut_ad(!page_is_empty(root->frame)); + index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + ulint space = index->table->space_id; + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root->frame, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root->frame, space)); + } + + ut_a(dict_index_get_page(index) == root->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root->frame); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr); + + if (new_block == NULL && os_has_said_disk_full) { + return(NULL); + } + + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + if (page_has_siblings(new_block->frame)) { + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8); + mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, + 0xff, 8); + } + } + + /* Copy the records from root to the new page one by one. */ + + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root, + page_get_infimum_rec(root->frame), + index, mtr)) { + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_block, + root_page_zip, root->frame, index, mtr); + + /* Update the lock table and possible hash index. */ + lock_move_rec_list_end(new_block, root, + page_get_infimum_rec(root->frame)); + + /* Move any existing predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(new_block, root); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root); + } + } + + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; + if (dict_index_is_sec_or_ibuf(index)) { + /* In secondary indexes and the change buffer, + PAGE_MAX_TRX_ID can be reset on the root page, because + the field only matters on leaf pages, and the root no + longer is a leaf page. (Older versions of InnoDB did + set PAGE_MAX_TRX_ID on all secondary index pages.) */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } + } + } else { + /* PAGE_ROOT_AUTO_INC is only present in the clustered index + root page; on other clustered index pages, we want to reserve + the field PAGE_MAX_TRX_ID for future use. */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } + } + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_root_raise(new_block, root); + } + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + rec = page_rec_get_next(page_get_infimum_rec(new_block->frame)); + new_page_no = new_block->page.id().page_no(); + + /* Build the node pointer (= node key and page address) for the + child */ + if (dict_index_is_spatial(index)) { + rtr_mbr_t new_mbr; + + rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); + node_ptr = rtr_index_build_node_ptr( + index, &new_mbr, rec, new_page_no, *heap); + } else { + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + } + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(root->frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + btr_set_instant(root, *index, mtr); + } + + ut_ad(!page_has_siblings(root->frame)); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + index, offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_block at this point. Thus, the data should fit. */ + ut_a(node_ptr_rec); + + /* We play safe and reset the free bits for the new page */ + + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + } + + if (tuple != NULL) { + /* Reposition the cursor to the child node */ + page_cur_search(new_block, index, tuple, page_cursor); + } else { + /* Set cursor to first record on child node */ + page_cur_set_before_first(new_block, page_cursor); + } + + /* Split the child and insert tuple */ + if (dict_index_is_spatial(index)) { + /* Split rtree page and insert tuple */ + return(rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } else { + return(btr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the left. +@param[in] cursor insert position +@return the first record to be moved to the right half page +@retval NULL if no split is recommended */ +rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor) +{ + rec_t* split_rec = btr_cur_get_rec(cursor); + const page_t* page = page_align(split_rec); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + != page_rec_get_next(split_rec)) { + return NULL; + } + + /* The metadata record must be present in the leftmost leaf page + of the clustered index, if and only if index->is_instant(). + However, during innobase_instant_try(), index->is_instant() + would already hold when row_ins_clust_index_entry_low() + is being invoked to insert the the metadata record. + So, we can only assert that when the metadata record exists, + index->is_instant() must hold. */ + ut_ad(!page_is_leaf(page) || page_has_prev(page) + || cursor->index->is_instant() + || !(rec_get_info_bits(page_rec_get_next_const( + page_get_infimum_rec(page)), + cursor->index->table->not_redundant()) + & REC_INFO_MIN_REC_FLAG)); + + const rec_t* infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. */ + + if (split_rec == infimum + || split_rec == page_rec_get_next_const(infimum)) { + split_rec = page_rec_get_next(split_rec); + } + + return split_rec; +} + +/** Decide if the page should be split at the convergence point of inserts +converging to the right. +@param[in] cursor insert position +@param[out] split_rec if split recommended, the first record + on the right half page, or + NULL if the to-be-inserted record + should be first +@return whether split is recommended */ +bool +btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec) +{ + rec_t* insert_point = btr_cur_get_rec(cursor); + const page_t* page = page_align(insert_point); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) { + return false; + } + + insert_point = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } else { + insert_point = page_rec_get_next(insert_point); + if (page_rec_is_supremum(insert_point)) { + insert_point = NULL; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + } + + *split_rec = insert_point; + return true; +} + +/*************************************************************//** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. +@return split record, or NULL if tuple will be the first record on +the lower or upper half-page (determined by btr_page_tuple_smaller()) */ +static +rec_t* +btr_page_get_split_rec( +/*===================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert should be made */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + rec_offs* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (page_zip) { + /* Estimate the free space of an empty compressed page. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, + page_zip_get_size(page_zip)); + + if (free_space > (ulint) free_space_zip) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index, offsets, + page_is_leaf(page) + ? cursor->index->n_core_fields + : 0, + ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. +@return true if fits */ +static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + rec_offs** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index, *offsets)); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (split_rec == NULL) { + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) { + + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = split_rec; + } else { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + } + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + page_is_leaf(page) + ? cursor->index->n_core_fields + : 0, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + rec = page_rec_get_next_const(rec); + } + + return(false); +} + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + dberr_t err; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(level > 0); + + if (!dict_index_is_spatial(index)) { + dberr_t err = btr_cur_search_to_nth_level( + index, level, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_page_get_father_node_ptr_func " + << " level: " << level + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name; + } + } else { + /* For spatial index, initialize structures to track + its parents etc. */ + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + + rtr_info_update_btr(&cursor, &rtr_info); + + btr_cur_search_to_nth_level(index, level, tuple, + PAGE_CUR_RTREE_INSERT, + BTR_CONT_MODIFY_TREE, + &cursor, 0, file, line, mtr); + } + + ut_ad(cursor.flag == BTR_CUR_BINARY); + + err = btr_cur_optimistic_insert( + flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, &dummy_big_rec, 0, NULL, mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + ut_a(err == DB_SUCCESS); + } + + if (heap != NULL) { + mem_heap_free(heap); + } + + if (dict_index_is_spatial(index)) { + ut_ad(cursor.rtr_info); + + rtr_clean_rtr_info(&rtr_info, true); + } +} + +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. */ +static MY_ATTRIBUTE((nonnull)) +void +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + buf_block_t* prev_block = NULL; + buf_block_t* next_block = NULL; + buf_block_t* lower_block; + buf_block_t* upper_block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + rec_offs* offsets; + + lower_block = new_block; + upper_block = block; + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&cursor), + btr_cur_get_rec(&cursor), + offsets, lower_block->page.id().page_no(), mtr); + mem_heap_empty(heap); + } else { + lower_block = block; + upper_block = new_block; + } + + /* Get the level of the split pages */ + const ulint level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); + + /* Get the previous and next pages of page */ + const uint32_t prev_page_no = btr_page_get_prev(block->frame); + const uint32_t next_page_no = btr_page_get_next(block->frame); + + /* for consistency, both blocks should be locked, before change */ + if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { + prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, + !level, mtr); + } + if (next_page_no != FIL_NULL && direction != FSP_DOWN) { + next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, + !level, mtr); + } + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr( + index, split_rec, upper_block->page.id().page_no(), + heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. */ + + btr_insert_on_non_leaf_level(flags, index, level + 1, + node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* Update page links of the level */ + + if (prev_block) { +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(prev_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + btr_page_set_next(prev_block, lower_block->page.id().page_no(), + mtr); + } + + if (next_block) { +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_prev(next_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + btr_page_set_prev(next_block, upper_block->page.id().page_no(), + mtr); + } + + if (direction == FSP_DOWN) { + ut_ad(lower_block == new_block); + ut_ad(btr_page_get_next(upper_block->frame) == next_page_no); + btr_page_set_prev(lower_block, prev_page_no, mtr); + } else { + ut_ad(upper_block == new_block); + ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no); + btr_page_set_next(upper_block, next_page_no, mtr); + } + + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + rec_offs** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + page_cur_move_to_next(&pcur); + first_rec = page_cur_get_rec(&pcur); + + *offsets = rec_get_offsets( + first_rec, cursor->index, *offsets, + page_is_leaf(block->frame) ? cursor->index->n_core_fields : 0, + n_uniq, heap); + + return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0); +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. +@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + rec_offs** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return(NULL); + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = NULL; + ulint max_size; + + next_block = btr_block_get(*cursor->index, next_page_no, RW_X_LATCH, + page_is_leaf(page), mtr); + if (UNIV_UNLIKELY(!next_block)) { + return NULL; + } + next_page = buf_block_get_frame(next_block); + + bool is_leaf = page_is_leaf(next_page); + + btr_page_get_father( + cursor->index, next_block, mtr, &next_father_cursor); + + page_cur_search( + next_block, cursor->index, tuple, PAGE_CUR_LE, + &next_page_cursor); + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_left(next_block, block); + } + + rec = page_cur_tuple_insert( + &next_page_cursor, tuple, cursor->index, offsets, &heap, + n_ext, mtr); + + if (rec == NULL) { + if (is_leaf + && next_block->page.zip.ssize + && !dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. */ + ibuf_reset_free_bits(next_block); + } + return(NULL); + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, false, mtr); + + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index, rec, next_block->page.id().page_no(), + heap, level); + + btr_insert_on_non_leaf_level( + flags, cursor->index, level + 1, node_ptr, mtr); + + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + + if (is_leaf + && !dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + if (next_block->page.zip.ssize) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +NOTE: jonaso added support for calling function with tuple == NULL +which cause it to only split a page. + +@return inserted record or NULL if run out of space */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ulint n_iterations = 0; + ulint n_uniq; + + if (cursor->index->is_spatial()) { + /* Split rtree page and update parent */ + return(rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr)); + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index)); + ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index), + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + if (rec_t* rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { + return(rec); + } + + /* 1. Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + bool insert_left = false; + uint32_t hint_page_no = block->page.id().page_no() + 1; + byte direction = FSP_UP; + + if (tuple && n_iterations > 0) { + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { + direction = FSP_DOWN; + hint_page_no -= 2; + } else { + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + } + } + + DBUG_EXECUTE_IF("disk_is_full", + os_has_said_disk_full = true; + return(NULL);); + + /* 2. Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index, hint_page_no, direction, + page_level, mtr, mtr); + + if (!new_block) { + return(NULL); + } + + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index, + page_level, mtr); + /* Only record the leaf level page splits. */ + if (!page_level) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index, *offsets, + page_is_leaf(page) + ? cursor->index->n_core_fields : 0, + n_uniq, heap); + + insert_left = !tuple + || cmp_dtuple_rec(tuple, split_rec, *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + ut_a(n_iterations > 0); + first_rec = page_rec_get_next(page_get_infimum_rec(page)); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = UT_NEW_ARRAY_NOKEY( + byte, + rec_get_converted_size(cursor->index, tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index, + tuple, n_ext); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } + + /* 4. Do first the modifications in the tree structure */ + + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */ + btr_attach_half_pages(flags, cursor->index, block, + first_rec, new_block, direction, mtr); + + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + bool insert_will_fit; + if (tuple == NULL) { + insert_will_fit = true; + } else if (split_rec) { + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, + offsets, tuple, n_ext, heap); + } else { + if (!insert_left) { + UT_DELETE_ARRAY(buf); + buf = NULL; + } + + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, NULL, + offsets, tuple, n_ext, heap); + } + + if (!srv_read_only_mode + && insert_will_fit + && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index)) { + + mtr->memo_release( + dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_start(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_end(move_limit - page + new_page, + new_block, cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + + /* Update the lock table and possible hash index. */ + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index, mtr); + } + + left_block = new_block; + right_block = block; + + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_left(right_block, left_block); + } + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_move_rec_list_end(new_block, block, move_limit, + cursor->index, mtr)) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index, mtr); + + /* Update the lock table and possible hash index. */ + lock_move_rec_list_end(new_block, block, move_limit); + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_end(move_limit, block, + cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + } + + left_block = block; + right_block = new_block; + + if (!dict_table_is_locking_disabled(cursor->index->table)) { + lock_update_split_right(right_block, left_block); + } + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index)); + ut_a(page_zip_validate(new_page_zip, new_page, cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + rec_t* rec; + buf_block_t* const insert_block = insert_left + ? left_block : right_block; + + if (UNIV_UNLIKELY(!tuple)) { + rec = NULL; + goto func_exit; + } + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_search(insert_block, cursor->index, tuple, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + + if (page_cur_get_page_zip(page_cursor) + || !btr_page_reorganize(page_cursor, cursor->index, mtr)) { + + goto insert_failed; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary() + && page_is_leaf(page)) { + + ibuf_update_free_bits_for_two_pages_low( + left_block, right_block, mtr); + } + + MONITOR_INC(MONITOR_INDEX_SPLIT); + + ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); + + ut_ad(tuple || !rec); + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); + return(rec); +} + +/** Remove a page from the level list of pages. +@param[in] block page to remove +@param[in] index index tree +@param[in,out] mtr mini-transaction */ +void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block.zip_size() == index.table->space->zip_size()); + ut_ad(index.table->space->id == block.page.id().space()); + /* Get the previous and next page numbers of page */ + + const page_t* page = block.frame; + const uint32_t prev_page_no = btr_page_get_prev(page); + const uint32_t next_page_no = btr_page_get_next(page); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get( + index, prev_page_no, RW_X_LATCH, page_is_leaf(page), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) == page_is_comp(page)); + static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(prev_block->frame + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(prev_block, next_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get( + index, next_page_no, RW_X_LATCH, page_is_leaf(page), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); + static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(next_block->frame + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(next_block, prev_page_no, mtr); + } +} + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* father_block; + ulint page_level; + page_zip_des_t* father_page_zip; + page_t* page = buf_block_get_frame(block); + ulint root_page_no; + buf_block_t* blocks[BTR_MAX_LEVELS]; + ulint n_blocks; /*!< last used index in blocks[] */ + ulint i; + bool lift_father_up; + buf_block_t* block_orig = block; + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + page_level = btr_page_get_level(page); + root_page_no = dict_index_get_page(index); + + { + btr_cur_t cursor; + rec_offs* offsets = NULL; + mem_heap_t* heap = mem_heap_create( + sizeof(*offsets) + * (REC_OFFS_HEADER_SIZE + 1 + 1 + + unsigned(index->n_fields))); + buf_block_t* b; + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, block, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, heap, + index, block, + mtr, &cursor); + } + father_block = btr_cur_get_block(&cursor); + father_page_zip = buf_block_get_page_zip(father_block); + + n_blocks = 0; + + /* Store all ancestor pages so we can reset their + levels later on. We have to do all the searches on + the tree now because later on, after we've replaced + the first level, the tree is in an inconsistent state + and can not be searched. */ + for (b = father_block; + b->page.id().page_no() != root_page_no; ) { + ut_a(n_blocks < BTR_MAX_LEVELS); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, b, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(offsets, + heap, + index, b, + mtr, + &cursor); + } + + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); + } + + lift_father_up = (n_blocks && page_level == 0); + if (lift_father_up) { + /* The father page also should be the only on its level (not + root). We should lift up the father page at first. + Because the leaf page should be lifted up only for root page. + The freeing page is based on page_level (==0 or !=0) + to choose segment. If the page_level is changed ==0 from !=0, + later freeing of the page doesn't find the page allocation + to be freed.*/ + + block = father_block; + page = buf_block_get_frame(block); + page_level = btr_page_get_level(page); + + ut_ad(!page_has_siblings(page)); + ut_ad(mtr->memo_contains_flagged(block, + MTR_MEMO_PAGE_X_FIX)); + + father_block = blocks[0]; + father_page_zip = buf_block_get_page_zip(father_block); + } + + mem_heap_free(heap); + } + + btr_search_drop_page_hash_index(block); + + /* Make the father empty */ + btr_page_empty(father_block, father_page_zip, index, page_level, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(father_block->frame)); + + if (index->is_instant() + && father_block->page.id().page_no() == root_page_no) { + ut_ad(!father_page_zip); + btr_set_instant(father_block, *index, mtr); + } + + page_level++; + + /* Copy the records to the father page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || father_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(father_block, block, + page_get_infimum_rec(page), + index, mtr)) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(father_page_zip); + ut_a(page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(father_block, + page_zip, page, index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); + + /* Also update the predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(father_block, block); + } else { + btr_search_move_or_delete_hash_entries( + father_block, block); + } + } + + if (!dict_table_is_locking_disabled(index->table)) { + /* Free predicate page locks on the block */ + if (dict_index_is_spatial(index)) { + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_mutex_exit(); + } + lock_update_copy_and_discard(father_block, block); + } + + /* Go upward to root page, decrementing levels by one. */ + for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) { + ut_ad(btr_page_get_level(blocks[i]->frame) == page_level + 1); + btr_page_set_level(blocks[i], page_level, mtr); + } + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* We play it safe and reset the free bits for the father */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary()) { + ibuf_reset_free_bits(father_block); + } + ut_ad(page_validate(father_block->frame, index)); + ut_ad(btr_check_node_ptr(index, father_block, mtr)); + + return(lift_father_up ? block_orig : father_block); +} + +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the brother +reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to the +brothers, if they exist. +@return TRUE on success */ +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index; + buf_block_t* merge_block; + page_t* merge_page = NULL; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + rec_offs* offsets; + ulint nth_rec = 0; /* remove bogus warning */ + bool mbr_changed = false; +#ifdef UNIV_DEBUG + bool leftmost_child; +#endif + DBUG_ENTER("btr_compress"); + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + + btr_assert_not_corrupted(block, index); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS); + + const uint32_t left_page_no = btr_page_get_prev(page); + const uint32_t right_page_no = btr_page_get_next(page); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + heap = mem_heap_create(100); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block( + NULL, heap, index, block, mtr, cursor, &father_cursor); + ut_ad(cursor->page_cur.block->page.id() == block->page.id()); + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::info() << "father positioned on page " + << page_no << "instead of " + << block->page.id().page_no(); + offsets = btr_page_get_father_block( + NULL, heap, index, block, mtr, &father_cursor); + } + } else { + offsets = btr_page_get_father_block( + NULL, heap, index, block, mtr, &father_cursor); + } + + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + ut_ad(nth_rec > 0); + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr); + goto func_exit; + } + + ut_d(leftmost_child = + left_page_no != FIL_NULL + && (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page(&father_cursor))) + == btr_cur_get_rec(&father_cursor))); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); +retry: + if (!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + if (!merge_block) { + merge_page = NULL; + } + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + +#ifdef UNIV_BTR_DEBUG + if (is_left) { + ut_a(btr_page_get_next(merge_page) + == block->page.id().page_no()); + } else { + ut_a(btr_page_get_prev(merge_page) + == block->page.id().page_no()); + } +#endif /* UNIV_BTR_DEBUG */ + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* Move records to the merge page */ + if (is_left) { + btr_cur_t cursor2; + rtr_mbr_t new_mbr; + rec_offs* offsets2 = NULL; + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + /* We only support merge pages with the same parent + page */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), + merge_block, heap)) { + is_left = false; + goto retry; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(cursor2.page_cur.block->frame) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + /* Check if parent entry needs to be updated */ + mbr_changed = rtr_merge_mbr_changed( + &cursor2, &father_cursor, + offsets2, offsets, &new_mbr); + } + + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr); + + if (!orig_pred) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + + if (dict_index_is_spatial(index)) { + rec_t* my_rec = father_cursor.page_cur.rec; + + ulint page_no = btr_node_ptr_get_child_page_no( + my_rec, offsets); + + if (page_no != block->page.id().page_no()) { + ib::fatal() << "father positioned on " + << page_no << " instead of " + << block->page.id().page_no(); + } + + if (mbr_changed) { +#ifdef UNIV_DEBUG + bool success = rtr_update_mbr_field( + &cursor2, offsets2, &father_cursor, + merge_page, &new_mbr, NULL, mtr); + + ut_ad(success); +#else + rtr_update_mbr_field( + &cursor2, offsets2, &father_cursor, + merge_page, &new_mbr, NULL, mtr); +#endif + } else { + rtr_node_ptr_delete(&father_cursor, mtr); + } + + /* No GAP lock needs to be worrying about */ + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_rec_free_all_from_discard_page(block); + lock_mutex_exit(); + } else { + btr_cur_node_ptr_delete(&father_cursor, mtr); + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_merge_left( + merge_block, orig_pred, block); + } + } + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } + } else { + rec_t* orig_succ; + ibool compressed; + dberr_t err; + btr_cur_t cursor2; + /* father cursor pointing to node ptr + of the right sibling */ +#ifdef UNIV_BTR_DEBUG + byte fil_page_prev[4]; +#endif /* UNIV_BTR_DEBUG */ + + if (dict_index_is_spatial(index)) { + cursor2.rtr_info = NULL; + + /* For spatial index, we disallow merge of blocks + with different parents, since the merge would need + to update entry (for MBR and Primary key) in the + parent of block being merged */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), + merge_block, heap)) { + goto err_exit; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + } else { + btr_page_get_father(index, merge_block, mtr, &cursor2); + } + + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); +#ifdef UNIV_BTR_DEBUG + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); +#endif /* UNIV_BTR_DEBUG */ + compile_time_assert(FIL_NULL == 0xffffffffU); + memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index, mtr); + + if (!orig_succ) { + ut_a(merge_page_zip); +#ifdef UNIV_BTR_DEBUG + if (left_page_no == FIL_NULL) { + /* FIL_PAGE_PREV was restored from + merge_page_zip. */ + ut_a(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); + } +#endif /* UNIV_BTR_DEBUG */ + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + if (merge_page_zip && left_page_no == FIL_NULL) { + + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. */ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } +#endif /* UNIV_BTR_DEBUG */ + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&father_cursor), + btr_cur_get_rec(&father_cursor), + offsets, right_page_no, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec( + buf_block_get_frame(merge_block))), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + rec_offs* offsets2; + ulint rec_info; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(cursor2.page_cur.block->frame) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&cursor2), offsets2) + == right_page_no); + + rec_info = rec_get_info_bits( + btr_cur_get_rec(&father_cursor), + rec_offs_comp(offsets)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the father node ptr is minimal rec, + we will keep it and delete the node ptr of + merge page. */ + rtr_merge_and_update_mbr(&father_cursor, + &cursor2, + offsets, offsets2, + merge_page, mtr); + } else { + /* Otherwise, we will keep the node ptr of + merge page and delete the father node ptr. + This is for keeping the rec order in upper + level. */ + rtr_merge_and_update_mbr(&cursor2, + &father_cursor, + offsets2, offsets, + merge_page, mtr); + } + lock_mutex_enter(); + lock_prdt_page_free_from_discard( + block, &lock_sys.prdt_page_hash); + lock_rec_free_all_from_discard_page(block); + lock_mutex_exit(); + } else { + + compressed = btr_cur_pessimistic_delete(&err, TRUE, + &cursor2, + BTR_CREATE_FLAG, + false, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, + FALSE, + mtr); + } + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_merge_right( + merge_block, orig_succ, block); + } + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (merge_block->zip_size()) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(leftmost_child + || btr_check_node_ptr(index, merge_block, mtr)); +func_exit: + mem_heap_free(heap); + + if (adjust) { + ut_ad(nth_rec > 0); + btr_cur_position( + index, + page_rec_get_nth(merge_block->frame, nth_rec), + merge_block, cursor); + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); + + DBUG_RETURN(TRUE); + +err_exit: + /* We play it safe and reset the free bits. */ + if (merge_block && merge_block->zip_size() + && page_is_leaf(merge_block->frame) + && !dict_index_is_clust(index)) { + + ibuf_reset_free_bits(merge_block); + } + + mem_heap_free(heap); + DBUG_RETURN(FALSE); +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should almost never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +ATTRIBUTE_COLD +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + + ut_ad(!index->is_dummy); + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + const trx_id_t max_trx_id = page_get_max_trx_id(block->frame); + const rec_t* r = page_rec_get_next(page_get_infimum_rec(block->frame)); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + + while (block->page.id().page_no() != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page)); + ut_a(!page_has_siblings(page)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(block->page.id().space() == index->table->space->id); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block); + + if (dict_index_is_spatial(index)) { + /* Check any concurrent search having this page */ + rtr_check_discard_page(index, NULL, block); + rtr_page_get_father(index, block, mtr, NULL, &cursor); + } else { + btr_page_get_father(index, block, mtr, &cursor); + } + father = btr_cur_get_block(&cursor); + + if (!dict_table_is_locking_disabled(index->table)) { + lock_update_discard( + father, PAGE_HEAP_NO_SUPREMUM, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). */ + ut_ad(!page_has_siblings(block->frame)); + +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + const ulint space = index->table->space_id; + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + mem_heap_t* heap = nullptr; + const rec_t* rec = nullptr; + rec_offs* offsets = nullptr; + if (index->table->instant || index->must_avoid_clear_instant_add()) { + if (!rec_is_metadata(r, *index)) { + } else if (!index->table->instant + || rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(block->frame)); + + if (index->is_primary()) { + if (rec) { + page_cur_t cur; + page_cur_set_before_first(block, &cur); + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low(&cur, index, rec, + offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } + } else if (!index->table->is_temporary()) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* merge_block; + buf_block_t* block; + btr_cur_t parent_cursor; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + if (dict_index_is_spatial(index)) { + rtr_page_get_father(index, block, mtr, cursor, &parent_cursor); + } else { + btr_page_get_father(index, block, mtr, &parent_cursor); + } + + /* Decide the page which will inherit the locks */ + + const uint32_t left_page_no = btr_page_get_prev(block->frame); + const uint32_t right_page_no = btr_page_get_next(block->frame); + + ut_d(bool parent_is_different = false); + if (left_page_no != FIL_NULL) { + merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH, + true, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_d(parent_is_different = + (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page( + &parent_cursor))) + == btr_cur_get_rec(&parent_cursor))); + } else if (right_page_no != FIL_NULL) { + merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH, + true, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + ut_d(parent_is_different = page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); + if (!page_is_leaf(merge_block->frame)) { + rec_t* node_ptr = page_rec_get_next( + page_get_infimum_rec(merge_block->frame)); + ut_ad(page_rec_is_user_rec(node_ptr)); + /* We have to mark the leftmost node pointer as the + predefined minimum record. */ + btr_set_min_rec_mark<true>(node_ptr, *merge_block, + mtr); + } + } else { + btr_discard_only_page_on_level(index, block, mtr); + + return; + } + + ut_a(page_is_comp(merge_block->frame) == page_is_comp(block->frame)); + ut_ad(!memcmp_aligned<2>(&merge_block->frame[PAGE_HEADER + PAGE_LEVEL], + &block->frame[PAGE_HEADER + PAGE_LEVEL], 2)); + btr_search_drop_page_hash_index(block); + + if (dict_index_is_spatial(index)) { + rtr_node_ptr_delete(&parent_cursor, mtr); + } else { + btr_cur_node_ptr_delete(&parent_cursor, mtr); + } + + /* Remove the page from the level list */ + btr_level_list_remove(*block, *index, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, merge_block->frame, + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (!dict_table_is_locking_disabled(index->table)) { + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + } + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, cursor, block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(parent_is_different + || btr_check_node_ptr(index, merge_block, mtr)); + + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page + && !page_has_siblings(btr_cur_get_page(&parent_cursor)) + && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) { + btr_lift_page_up(index, merge_block, mtr); + } +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. */ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_ibuf(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); + + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) + << " page " << block->page.id; + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets( + node_ptr, index, *offsets, 0, + ULINT_UNDEFINED, heap); + btr_print_recursive(index, + btr_node_ptr_get_child(node_ptr, + index, + *offsets, + &mtr2), + width, heap, offsets, &mtr2); + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_SX_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + ut_ad(btr_validate_index(index, 0)); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. +@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + rec_offs* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + if (dict_index_get_page(index) == block->page.id().page_no()) { + + return(TRUE); + } + + heap = mem_heap_create(256); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block(NULL, heap, index, block, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &cursor); + } + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page)); + + /* For spatial index, the MBR in the parent rec could be different + with that of first rec of child, their relationship should be + "WITHIN" relationship */ + if (dict_index_is_spatial(index)) { + ut_a(!cmp_dtuple_rec_with_gis( + tuple, btr_cur_get_rec(&cursor), + PAGE_CUR_WITHIN)); + } else { + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets)); + } +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + ib::info() << "Record in index " << index->name + << " of table " << index->table->name + << ", page " << page_id_t(page_get_space_id(page), + page_get_page_no(page)) + << ", at offset " << page_offset(rec); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. +@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + const page_t* page; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + fprintf(stderr, "index name is %s\n", index->name()); + } +#endif + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Compact flag=" << !!page_is_comp(page) + << ", should be " << dict_table_is_comp(index->table); + + return(FALSE); + } + + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + && rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { +n_field_mismatch: + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; + + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + } + + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); + + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { + rec_get_nth_field_offs(offsets, i, &len); + + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + if (rec_offs_nth_extern(offsets, i)) { + const byte* data = rec_get_nth_field( + rec, offsets, i, &len); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ulint extern_len = mach_read_from_4( + data + len + BTR_EXTERN_LEN + 4); + if (fixed_size == extern_len) { + goto next_field; + } + } + } + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + We should adjust it here. + A prefix index of the column is of fixed, but different + length. When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { +len_mismatch: + btr_index_rec_validate_report(page, rec, index); + ib::error error; + + error << "Field " << i << " len is " << len + << ", should be " << fixed_size; + + if (dump_on_error) { + error << "; "; + rec_print(error.m_oss, rec, + rec_get_info_bits( + rec, rec_offs_comp(offsets)), + offsets); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } +next_field: + field++; + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + rec_print_new(stderr, rec, offsets); + } +#endif + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return TRUE if ok */ +static +ibool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; + ibool ret = TRUE; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + page_cur_move_to_next(&cur); + + for (;;) { + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + + return(FALSE); + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + + page_cur_move_to_next(&cur); + } + + return(ret); +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + ib::error error; + error << "In page " << block->page.id().page_no() + << " of index " << index->name + << " of table " << index->table->name; + + if (level > 0) { + error << ", index tree level " << level; + } +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; + + if (level) + error << ", index tree level " << level; +} + +/************************************************************//** +Validates index tree level. +@return TRUE if ok */ +static +bool +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level, /*!< in: level number */ + bool lockout)/*!< in: true if X-latch index is intended */ +{ + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + bool ret = true; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + rec_offs* offsets = NULL; + rec_offs* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + ulint savepoint = 0; + ulint savepoint2 = 0; + uint32_t parent_page_no = FIL_NULL; + uint32_t parent_right_page_no = FIL_NULL; + bool rightmost_child = false; + + mtr.start(); + + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + + block = btr_root_block_get(index, RW_SX_LATCH, &mtr); + page = buf_block_get_frame(block); + + fil_space_t* space = index->table->space; + + while (level != btr_page_get_level(page)) { + const rec_t* node_ptr; + + if (fseg_page_is_free(space, block->page.id().page_no())) { + + btr_validate_report1(index, level, block); + + ib::warn() << "Page is free"; + + ret = false; + } + + ut_a(index->table->space_id == block->page.id().space()); + ut_a(block->page.id().space() == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(!page_is_leaf(page)); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + node_ptr = page_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + savepoint2 = mtr_set_savepoint(&mtr); + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr); + page = buf_block_get_frame(block); + + /* For R-Tree, since record order might not be the same as + linked index page in the lower level, we need to travers + backwards to get the first page rec in this level. + This is only used for index validation. Spatial index + does not use such scan for any of its DML or query + operations */ + if (dict_index_is_spatial(index)) { + uint32_t left_page_no = btr_page_get_prev(page); + + while (left_page_no != FIL_NULL) { + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr_release_block_at_savepoint( + &mtr, savepoint2, block); + + savepoint2 = mtr_set_savepoint(&mtr); + block = btr_block_get(*index, left_page_no, + RW_SX_LATCH, false, + &mtr); + page = buf_block_get_frame(block); + left_page_no = btr_page_get_prev(page); + } + } + } + + /* Now we are on the desired level. Loop through the pages on that + level. */ + +loop: + mem_heap_empty(heap); + offsets = offsets2 = NULL; + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_a(block->page.id().space() == index->table->space_id); + + if (fseg_page_is_free(space, block->page.id().page_no())) { + + btr_validate_report1(index, level, block); + + ib::warn() << "Page is marked as free"; + ret = false; + + } else if (btr_page_get_index_id(page) != index->id) { + + ib::error() << "Page index id " << btr_page_get_index_id(page) + << " != data dictionary index id " << index->id; + + ret = false; + + } else if (!page_validate(page, index)) { + + btr_validate_report1(index, level, block); + ret = false; + + } else if (level == 0 && !btr_index_page_validate(block, index)) { + + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. */ + + ret = false; + } + + ut_a(btr_page_get_level(page) == level); + + uint32_t right_page_no = btr_page_get_next(page); + uint32_t left_page_no = btr_page_get_prev(page); + + ut_a(!page_is_empty(page) + || (level == 0 + && page_get_page_no(page) == dict_index_get_page(index))); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + savepoint = mtr_set_savepoint(&mtr); + + right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr); + right_page = buf_block_get_frame(right_block); + + if (btr_page_get_prev(right_page) != page_get_page_no(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + + ret = false; + } + + if (page_is_comp(right_page) != page_is_comp(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: 'compact' flag mismatch\n", stderr); + + ret = false; + + goto node_ptr_fails; + } + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_get_offsets(rec, index, offsets, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, offsets2, + page_is_leaf(right_page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + /* For spatial index, we cannot guarantee the key ordering + across pages, so skip the record compare verification for + now. Will enhanced in special R-Tree index validation scheme */ + if (!dict_index_is_spatial(index) + && cmp_rec_rec(rec, right_rec, + offsets, offsets2, index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + fputs("InnoDB: record ", stderr); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + + ret = false; + } + } + + if (level > 0 && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } + + /* Similarly skip the father node check for spatial index for now, + for a couple of reasons: + 1) As mentioned, there is no ordering relationship between records + in parent level and linked pages in the child level. + 2) Search parent from root is very costly for R-tree. + We will add special validation mechanism for R-tree later (WL #7520) */ + if (!dict_index_is_spatial(index) + && block->page.id().page_no() != dict_index_get_page(index)) { + + /* Check father node pointers */ + rec_t* node_ptr; + + btr_cur_position( + index, page_rec_get_next(page_get_infimum_rec(page)), + block, &node_cur); + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + parent_page_no = page_get_page_no(father_page); + parent_right_page_no = btr_page_get_next(father_page); + rightmost_child = page_rec_is_supremum( + page_rec_get_next(node_ptr)); + + btr_cur_position( + index, + page_rec_get_prev(page_get_supremum_rec(page)), + block, &node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + if (node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != block->page.id().page_no()) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %u\n", + btr_node_ptr_get_child_page_no(rec, offsets)); + + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + + if (!page_is_leaf(page)) { + node_ptr_tuple = dict_index_build_node_ptr( + index, + page_rec_get_next(page_get_infimum_rec(page)), + 0, heap, btr_page_get_level(page)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + const rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + btr_validate_report1(index, level, block); + + ib::error() << "Node ptrs differ on levels > 0"; + + fputs("InnoDB: node ptr ",stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + ret = false; + + goto node_ptr_fails; + } + } + + if (left_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_next( + page_get_infimum_rec(father_page))); + ut_a(!page_has_prev(father_page)); + } + + if (right_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_prev( + page_get_supremum_rec(father_page))); + ut_a(!page_has_next(father_page)); + } else { + const rec_t* right_node_ptr; + + right_node_ptr = page_rec_get_next(node_ptr); + + if (!lockout && rightmost_child) { + + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr_release_block_at_savepoint( + &mtr, savepoint, right_block); + + if (parent_right_page_no != FIL_NULL) { + btr_block_get(*index, + parent_right_page_no, + RW_SX_LATCH, false, + &mtr); + } + + right_block = btr_block_get(*index, + right_page_no, + RW_SX_LATCH, + !level, &mtr); + } + + btr_cur_position( + index, page_rec_get_next( + page_get_infimum_rec( + buf_block_get_frame( + right_block))), + right_block, &right_node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &right_node_cur, &mtr); + + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { + ret = false; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + ret = false; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page)) { + + ret = false; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr.commit(); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr.start(); + + if (!lockout) { + if (rightmost_child) { + if (parent_right_page_no != FIL_NULL) { + btr_block_get(*index, + parent_right_page_no, + RW_SX_LATCH, false, + &mtr); + } + } else if (parent_page_no != FIL_NULL) { + btr_block_get(*index, parent_page_no, + RW_SX_LATCH, false, &mtr); + } + } + + block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr); + page = buf_block_get_frame(block); + + goto loop; + } + + mem_heap_free(heap); + + return(ret); +} + +/**************************************************************//** +Checks the consistency of an index tree. +@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + dberr_t err = DB_SUCCESS; + bool lockout = dict_index_is_spatial(index); + + /* Full Text index are implemented by auxiliary tables, + not the B-tree */ + if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) { + return(err); + } + + mtr_t mtr; + + mtr_start(&mtr); + + if (!srv_read_only_mode) { + if (lockout) { + mtr_x_lock_index(index, &mtr); + } else { + mtr_sx_lock_index(index, &mtr); + } + } + + page_t* root = btr_root_get(index, &mtr); + + if (!root) { + mtr_commit(&mtr); + return DB_CORRUPTION; + } + + ulint n = btr_page_get_level(root); + + btr_validate_index_running++; + for (ulint i = 0; i <= n; ++i) { + + if (!btr_validate_level(index, trx, n - i, lockout)) { + err = DB_CORRUPTION; + } + } + + mtr_commit(&mtr); + /* In theory we need release barrier here, so that + btr_validate_index_running decrement is guaranteed to + happen after latches are released. + + Original code issued SEQ_CST on update and non-atomic + access on load. Which means it had broken synchronisation + as well. */ + btr_validate_index_running--; + + return(err); +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. */ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + dict_index_t* index; + page_t* page; + ulint n_recs; + ulint data_size; + ulint max_ins_size_reorg; + ulint max_ins_size; + buf_block_t* mblock; + page_t* mpage; + DBUG_ENTER("btr_can_merge_with_page"); + + if (page_no == FIL_NULL) { + *merge_block = NULL; + DBUG_RETURN(false); + } + + index = btr_cur_get_index(cursor); + page = btr_cur_get_page(cursor); + + mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page), + mtr); + mpage = buf_block_get_frame(mblock); + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + mpage, n_recs); + + if (data_size > max_ins_size_reorg) { + goto error; + } + + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. */ + if (mblock->page.zip.data && page_is_leaf(mpage) + && (page_get_data_size(mpage) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + if (data_size > max_ins_size) { + /* We have to reorganize mpage */ + if (!btr_page_reorganize_block(page_zip_level, mblock, index, + mtr)) { + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + ut_ad(page_validate(mpage, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should + never happen */ + + goto error; + } + } + + *merge_block = mblock; + DBUG_RETURN(true); + +error: + *merge_block = NULL; + DBUG_RETURN(false); +} diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc new file mode 100644 index 00000000..9004064a --- /dev/null +++ b/storage/innobase/btr/btr0bulk.cc @@ -0,0 +1,1238 @@ +/***************************************************************************** + +Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0bulk.cc +The B-tree bulk load + +Created 03/11/2014 Shaohua Wang +*******************************************************/ + +#include "btr0bulk.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "ibuf0ibuf.h" +#include "page0page.h" +#include "trx0trx.h" + +/** Innodb B-tree index fill factor for bulk load. */ +uint innobase_fill_factor; + +/** Initialize members, allocate page if needed and start mtr. +Note: we commit all mtrs on failure. +@return error code. */ +dberr_t +PageBulk::init() +{ + buf_block_t* new_block; + page_t* new_page; + + ut_ad(m_heap == NULL); + m_heap = mem_heap_create(1000); + + m_mtr.start(); + m_index->set_modified(m_mtr); + + if (m_page_no == FIL_NULL) { + mtr_t alloc_mtr; + + /* We commit redo log for allocation by a separate mtr, + because we don't guarantee pages are committed following + the allocation order, and we will always generate redo log + for page allocation, even when creating a new tablespace. */ + alloc_mtr.start(); + m_index->set_modified(alloc_mtr); + + uint32_t n_reserved; + if (!fsp_reserve_free_extents(&n_reserved, + m_index->table->space, + 1, FSP_NORMAL, &alloc_mtr)) { + alloc_mtr.commit(); + m_mtr.commit(); + return(DB_OUT_OF_FILE_SPACE); + } + + /* Allocate a new page. */ + new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, + &alloc_mtr, &m_mtr); + + m_index->table->space->release_free_extents(n_reserved); + + alloc_mtr.commit(); + + new_page = buf_block_get_frame(new_block); + m_page_no = new_block->page.id().page_no(); + + byte* index_id = my_assume_aligned<2> + (PAGE_HEADER + PAGE_INDEX_ID + new_page); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8); + + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + mach_write_to_8(index_id, m_index->id); + page_create_zip(new_block, m_index, m_level, 0, + &m_mtr); + } else { + ut_ad(!m_index->is_spatial()); + page_create(new_block, &m_mtr, + m_index->table->not_redundant()); + m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff); + m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER + + PAGE_LEVEL + + new_page, m_level); + m_mtr.write<8>(*new_block, index_id, m_index->id); + } + } else { + new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH, + false, &m_mtr); + + new_page = buf_block_get_frame(new_block); + ut_ad(new_block->page.id().page_no() == m_page_no); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + + btr_page_set_level(new_block, m_level, &m_mtr); + } + + m_page_zip = buf_block_get_page_zip(new_block); + + if (!m_level && dict_index_is_sec_or_ibuf(m_index)) { + page_update_max_trx_id(new_block, m_page_zip, m_trx_id, + &m_mtr); + } + + m_block = new_block; + m_page = new_page; + m_cur_rec = page_get_infimum_rec(new_page); + ut_ad(m_is_comp == !!page_is_comp(new_page)); + m_free_space = page_get_free_space_of_empty(m_is_comp); + + if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) { + /* Keep default behavior compatible with 5.6 */ + m_reserved_space = dict_index_get_space_reserve(); + } else { + m_reserved_space = + srv_page_size * (100 - innobase_fill_factor) / 100; + } + + m_padding_space = + srv_page_size - dict_index_zip_pad_optimal_page_size(m_index); + m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP); + m_rec_no = page_header_get_field(new_page, PAGE_N_RECS); + /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0, + without writing redo log, to ensure that needs_finish() will hold + on an empty page. */ + ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION); + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0; + ut_d(m_total_data = 0); + + return(DB_SUCCESS); +} + +/** Insert a record in the page. +@tparam fmt the page format +@param[in,out] rec record +@param[in] offsets record offsets */ +template<PageBulk::format fmt> +inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets) +{ + ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED)); + ut_ad((fmt != REDUNDANT) == m_is_comp); + ut_ad(page_align(m_heap_top) == m_page); + ut_ad(m_heap); + + const ulint rec_size= rec_offs_size(offsets); + const ulint extra_size= rec_offs_extra_size(offsets); + ut_ad(page_align(m_heap_top + rec_size) == m_page); + ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec)); + +#ifdef UNIV_DEBUG + /* Check whether records are in order. */ + if (page_offset(m_cur_rec) != + (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) + { + const rec_t *old_rec = m_cur_rec; + rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf + ? m_index->n_core_fields : 0, + ULINT_UNDEFINED, &m_heap); + ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0); + } + + m_total_data+= rec_size; +#endif /* UNIV_DEBUG */ + + rec_t* const insert_rec= m_heap_top + extra_size; + + /* Insert the record in the linked list. */ + if (fmt != REDUNDANT) + { + const rec_t *next_rec= m_page + + page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT)); + if (fmt != COMPRESSED) + m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - m_cur_rec)); + else + { + mach_write_to_2(m_cur_rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - m_cur_rec)); + memcpy(m_heap_top, rec - extra_size, rec_size); + } + + rec_t * const this_rec= fmt != COMPRESSED + ? const_cast<rec_t*>(rec) : insert_rec; + rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK, + REC_N_OWNED_SHIFT); + rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no, + REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + mach_write_to_2(this_rec - REC_NEXT, + static_cast<uint16_t>(next_rec - insert_rec)); + } + else + { + memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2); + m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec)); + rec_set_bit_field_1(const_cast<rec_t*>(rec), 0, + REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(const_cast<rec_t*>(rec), + PAGE_HEAP_NO_USER_LOW + m_rec_no, + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + } + + if (fmt == COMPRESSED) + /* We already wrote the record. Log is written in PageBulk::compress(). */; + else if (page_offset(m_cur_rec) == + (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM)) + m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size); + else + { + /* Try to copy common prefix from the preceding record. */ + const byte *r= rec - extra_size; + const byte * const insert_rec_end= m_heap_top + rec_size; + byte *b= m_heap_top; + + /* Skip any unchanged prefix of the record. */ + for (; * b == *r; b++, r++); + + ut_ad(b < insert_rec_end); + + const byte *c= m_cur_rec - (rec - r); + const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets), + m_heap_top); + + /* Try to copy any bytes of the preceding record. */ + if (UNIV_LIKELY(c >= m_page && c < c_end)) + { + const byte *cm= c; + byte *bm= b; + const byte *rm= r; + for (; cm < c_end && *rm == *cm; cm++, bm++, rm++); + ut_ad(bm <= insert_rec_end); + size_t len= static_cast<size_t>(rm - r); + ut_ad(!memcmp(r, c, len)); + if (len > 2) + { + memcpy(b, c, len); + m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len); + c= cm; + b= bm; + r= rm; + } + } + + if (c < m_cur_rec) + { + if (!rec_offs_data_size(offsets)) + { +no_data: + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + goto rec_done; + } + /* Some header bytes differ. Compare the data separately. */ + const byte *cd= m_cur_rec; + byte *bd= insert_rec; + const byte *rd= rec; + /* Skip any unchanged prefix of the record. */ + for (;; cd++, bd++, rd++) + if (bd == insert_rec_end) + goto no_data; + else if (*bd != *rd) + break; + + /* Try to copy any data bytes of the preceding record. */ + if (c_end - cd > 2) + { + const byte *cdm= cd; + const byte *rdm= rd; + for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++) + ut_ad(rdm - rd + bd <= insert_rec_end); + size_t len= static_cast<size_t>(rdm - rd); + ut_ad(!memcmp(rd, cd, len)); + if (len > 2) + { + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c); + memcpy(bd, cd, len); + m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len); + c= cdm; + b= rdm - rd + bd; + r= rdm; + } + } + } + + if (size_t len= static_cast<size_t>(insert_rec_end - b)) + m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len); + } + +rec_done: + ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size)); + rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets); + + /* Update the member variables. */ + ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) - + page_dir_calc_reserved_space(m_rec_no); + + ut_ad(m_free_space >= rec_size + slot_size); + ut_ad(m_heap_top + rec_size < m_page + srv_page_size); + + m_free_space-= rec_size + slot_size; + m_heap_top+= rec_size; + m_rec_no++; + m_cur_rec= insert_rec; +} + +/** Insert a record in the page. +@param[in] rec record +@param[in] offsets record offsets */ +inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets) +{ + byte rec_hdr[REC_N_OLD_EXTRA_BYTES]; + static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format"); + + if (UNIV_LIKELY_NULL(m_page_zip)) + insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets); + else if (m_is_comp) + { + memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES); + insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr, + REC_N_NEW_EXTRA_BYTES); + } + else + { + memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES); + insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets); + memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr, + REC_N_OLD_EXTRA_BYTES); + } +} + +/** Set the number of owned records in the uncompressed page of +a ROW_FORMAT=COMPRESSED record without redo-logging. */ +static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned) +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); +} + +/** Mark end of insertion to the page. Scan all records to set page dirs, +and set page header members. +@tparam fmt page format */ +template<PageBulk::format fmt> +inline void PageBulk::finishPage() +{ + ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED)); + ut_ad((fmt != REDUNDANT) == m_is_comp); + + ulint count= 0; + ulint n_recs= 0; + byte *slot= my_assume_aligned<2>(m_page + srv_page_size - + (PAGE_DIR + PAGE_DIR_SLOT_SIZE)); + const page_dir_slot_t *const slot0 = slot; + compile_time_assert(PAGE_DIR_SLOT_SIZE == 2); + if (fmt != REDUNDANT) + { + uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page); + ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM); + offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM); + /* Set owner & dir. */ + while (offset != PAGE_NEW_SUPREMUM) + { + ut_ad(offset >= PAGE_NEW_SUPREMUM); + ut_ad(offset < page_offset(slot)); + count++; + n_recs++; + + if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) + { + slot-= PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, offset); + + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, m_page + offset, count, true, + &m_mtr); + else + rec_set_n_owned_zip(m_page + offset, count); + + count= 0; + } + + uint16_t next= static_cast<uint16_t> + ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) & + (srv_page_size - 1)); + ut_ad(next); + offset= next; + } + + if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <= + PAGE_DIR_SLOT_MAX_N_OWNED)) + { + /* Merge the last two slots, like page_cur_insert_rec_low() does. */ + count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot)); + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, rec, 0, true, &m_mtr); + else + rec_set_n_owned_zip(rec, 0); + } + else + slot-= PAGE_DIR_SLOT_SIZE; + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + if (fmt != COMPRESSED) + page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM, + count + 1, true, &m_mtr); + else + rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1); + } + else + { + rec_t *insert_rec= m_page + + mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page); + + /* Set owner & dir. */ + while (insert_rec != m_page + PAGE_OLD_SUPREMUM) + { + count++; + n_recs++; + + if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) + { + slot-= PAGE_DIR_SLOT_SIZE; + mach_write_to_2(slot, page_offset(insert_rec)); + page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr); + count= 0; + } + + insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT); + } + + if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <= + PAGE_DIR_SLOT_MAX_N_OWNED)) + { + /* Merge the last two slots, like page_cur_insert_rec_low() does. */ + count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot)); + page_rec_set_n_owned<false>(m_block, rec, 0, false, &m_mtr); + } + else + slot-= PAGE_DIR_SLOT_SIZE; + + mach_write_to_2(slot, PAGE_OLD_SUPREMUM); + page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1, + false, &m_mtr); + } + + if (!m_rec_no); + else if (fmt != COMPRESSED) + { + static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility"); + alignas(8) byte page_header[PAGE_N_HEAP + 2]; + mach_write_to_2(page_header + PAGE_N_DIR_SLOTS, + 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE); + mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page); + mach_write_to_2(page_header + PAGE_N_HEAP, + (PAGE_HEAP_NO_USER_LOW + m_rec_no) | + uint16_t{fmt != REDUNDANT} << 15); + m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header, + sizeof page_header); + m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); + m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot); + } + else + { + /* For ROW_FORMAT=COMPRESSED, redo log may be written in + PageBulk::compress(). */ + mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page, + 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE); + mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page, + static_cast<ulint>(m_heap_top - m_page)); + mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page, + (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15); + mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no); + } +} + +inline bool PageBulk::needs_finish() const +{ + ut_ad(page_align(m_cur_rec) == m_block->frame); + ut_ad(m_page == m_block->frame); + if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B]) + return true; + ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP); + ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW); + if (n_heap & 0x8000) + { + n_heap&= 0x7fff; + heap_no= rec_get_heap_no_new(m_cur_rec); + if (heap_no == PAGE_HEAP_NO_INFIMUM && + page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END) + return false; + } + else + { + heap_no= rec_get_heap_no_old(m_cur_rec); + if (heap_no == PAGE_HEAP_NO_INFIMUM && + page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END) + return false; + } + return heap_no != n_heap - 1; +} + +/** Mark end of insertion to the page. Scan all records to set page dirs, +and set page header members. +@tparam compressed whether the page is in ROW_FORMAT=COMPRESSED */ +inline void PageBulk::finish() +{ + ut_ad(!m_index->is_spatial()); + + if (!needs_finish()); + else if (UNIV_LIKELY_NULL(m_page_zip)) + finishPage<COMPRESSED>(); + else if (m_is_comp) + finishPage<DYNAMIC>(); + else + finishPage<REDUNDANT>(); + + /* In MariaDB 10.2, 10.3, 10.4, we would initialize + PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT + in the same way as we would during normal INSERT operations. + Starting with MariaDB Server 10.5, bulk insert will not + touch those fields. */ + ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]); + /* Restore the temporary change of PageBulk::init() that was necessary to + ensure that PageBulk::needs_finish() holds on an empty page. */ + m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION; + + ut_ad(!page_header_get_field(m_page, PAGE_FREE)); + ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE)); + ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT)); + ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION)); + ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <= + page_get_free_space_of_empty(m_is_comp)); + ut_ad(!needs_finish()); + ut_ad(page_validate(m_page, m_index)); +} + +/** Commit inserts done to the page +@param[in] success Flag whether all inserts succeed. */ +void PageBulk::commit(bool success) +{ + finish(); + if (success && !dict_index_is_clust(m_index) && page_is_leaf(m_page)) + ibuf_set_bitmap_for_bulk_load(m_block, innobase_fill_factor == 100); + m_mtr.commit(); +} + +/** Compress a page of compressed table +@return true compress successfully or no need to compress +@return false compress failed. */ +bool +PageBulk::compress() +{ + ut_ad(m_page_zip != NULL); + + return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr); +} + +/** Get node pointer +@return node pointer */ +dtuple_t* +PageBulk::getNodePtr() +{ + rec_t* first_rec; + dtuple_t* node_ptr; + + /* Create node pointer */ + first_rec = page_rec_get_next(page_get_infimum_rec(m_page)); + ut_a(page_rec_is_user_rec(first_rec)); + node_ptr = dict_index_build_node_ptr(m_index, first_rec, m_page_no, + m_heap, m_level); + + return(node_ptr); +} + +/** Get split rec in left page.We split a page in half when compresssion fails, +and the split rec will be copied to right page. +@return split rec */ +rec_t* +PageBulk::getSplitRec() +{ + rec_t* rec; + rec_offs* offsets; + ulint total_used_size; + ulint total_recs_size; + ulint n_recs; + + ut_ad(m_page_zip != NULL); + ut_ad(m_rec_no >= 2); + ut_ad(!m_index->is_instant()); + + ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space); + total_used_size = page_get_free_space_of_empty(m_is_comp) + - m_free_space; + + total_recs_size = 0; + n_recs = 0; + offsets = NULL; + rec = page_get_infimum_rec(m_page); + const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0; + + do { + rec = page_rec_get_next(rec); + ut_ad(page_rec_is_user_rec(rec)); + + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + total_recs_size += rec_offs_size(offsets); + n_recs++; + } while (total_recs_size + page_dir_calc_reserved_space(n_recs) + < total_used_size / 2); + + /* Keep at least one record on left page */ + if (page_rec_is_infimum(page_rec_get_prev(rec))) { + rec = page_rec_get_next(rec); + ut_ad(page_rec_is_user_rec(rec)); + } + + return(rec); +} + +/** Copy all records after split rec including itself. +@param[in] rec split rec */ +void +PageBulk::copyIn( + rec_t* split_rec) +{ + + rec_t* rec = split_rec; + rec_offs* offsets = NULL; + + ut_ad(m_rec_no == 0); + ut_ad(page_rec_is_user_rec(rec)); + + const ulint n_core = page_rec_is_leaf(rec) + ? m_index->n_core_fields : 0; + + do { + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + insert(rec, offsets); + + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + ut_ad(m_rec_no > 0); +} + +/** Remove all records after split rec including itself. +@param[in] rec split rec */ +void +PageBulk::copyOut( + rec_t* split_rec) +{ + rec_t* rec; + rec_t* last_rec; + ulint n; + + /* Suppose before copyOut, we have 5 records on the page: + infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec. + + after copyOut, we have 2 records on the page: + infimum->r1->r2->supremum. slot ajustment is not done. */ + + rec = page_rec_get_next(page_get_infimum_rec(m_page)); + last_rec = page_rec_get_prev(page_get_supremum_rec(m_page)); + n = 0; + + while (rec != split_rec) { + rec = page_rec_get_next(rec); + n++; + } + + ut_ad(n > 0); + + /* Set last record's next in page */ + rec_offs* offsets = NULL; + rec = page_rec_get_prev(split_rec); + const ulint n_core = page_rec_is_leaf(split_rec) + ? m_index->n_core_fields : 0; + + offsets = rec_get_offsets(rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + mach_write_to_2(rec - REC_NEXT, m_is_comp + ? static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(rec)) + : PAGE_OLD_SUPREMUM); + + /* Set related members */ + m_cur_rec = rec; + m_heap_top = rec_get_end(rec, offsets); + + offsets = rec_get_offsets(last_rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top) + + page_dir_calc_reserved_space(m_rec_no) + - page_dir_calc_reserved_space(n); + ut_ad(lint(m_free_space) > 0); + m_rec_no = n; + +#ifdef UNIV_DEBUG + m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top); +#endif /* UNIV_DEBUG */ +} + +/** Set next page +@param[in] next_page_no next page no */ +inline void PageBulk::setNext(ulint next_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no); +} + +/** Set previous page +@param[in] prev_page_no previous page no */ +inline void PageBulk::setPrev(ulint prev_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no); +} + +/** Check if required space is available in the page for the rec to be inserted. +We check fill factor & padding here. +@param[in] length required length +@return true if space is available */ +bool +PageBulk::isSpaceAvailable( + ulint rec_size) +{ + ulint slot_size; + ulint required_space; + + slot_size = page_dir_calc_reserved_space(m_rec_no + 1) + - page_dir_calc_reserved_space(m_rec_no); + + required_space = rec_size + slot_size; + + if (required_space > m_free_space) { + ut_ad(m_rec_no > 0); + return false; + } + + /* Fillfactor & Padding apply to both leaf and non-leaf pages. + Note: we keep at least 2 records in a page to avoid B-tree level + growing too high. */ + if (m_rec_no >= 2 + && ((m_page_zip == NULL && m_free_space - required_space + < m_reserved_space) + || (m_page_zip != NULL && m_free_space - required_space + < m_padding_space))) { + return(false); + } + + return(true); +} + +/** Check whether the record needs to be stored externally. +@return false if the entire record can be stored locally on the page */ +bool +PageBulk::needExt( + const dtuple_t* tuple, + ulint rec_size) +{ + return page_zip_rec_needs_ext(rec_size, m_is_comp, + dtuple_get_n_fields(tuple), + m_block->zip_size()); +} + +/** Store external record +Since the record is not logged yet, so we don't log update to the record. +the blob data is logged first, then the record is logged in bulk mode. +@param[in] big_rec external recrod +@param[in] offsets record offsets +@return error code */ +dberr_t +PageBulk::storeExt( + const big_rec_t* big_rec, + rec_offs* offsets) +{ + finish(); + + /* Note: not all fields are initialized in btr_pcur. */ + btr_pcur_t btr_pcur; + btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED; + btr_pcur.latch_mode = BTR_MODIFY_LEAF; + btr_pcur.btr_cur.index = m_index; + btr_pcur.btr_cur.page_cur.index = m_index; + btr_pcur.btr_cur.page_cur.rec = m_cur_rec; + btr_pcur.btr_cur.page_cur.offsets = offsets; + btr_pcur.btr_cur.page_cur.block = m_block; + + dberr_t err = btr_store_big_rec_extern_fields( + &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK); + + /* Reset m_block and m_cur_rec from page cursor, because + block may be changed during blob insert. (FIXME: Can it really?) */ + ut_ad(m_block == btr_pcur.btr_cur.page_cur.block); + + m_block = btr_pcur.btr_cur.page_cur.block; + m_cur_rec = btr_pcur.btr_cur.page_cur.rec; + m_page = buf_block_get_frame(m_block); + + return(err); +} + +/** Release block by commiting mtr +Note: log_free_check requires holding no lock/latch in current thread. */ +void +PageBulk::release() +{ + finish(); + + /* We fix the block because we will re-pin it soon. */ + buf_block_buf_fix_inc(m_block, __FILE__, __LINE__); + + /* No other threads can modify this block. */ + m_modify_clock = buf_block_get_modify_clock(m_block); + + m_mtr.commit(); +} + +/** Start mtr and latch the block */ +dberr_t +PageBulk::latch() +{ + m_mtr.start(); + m_index->set_modified(m_mtr); + + ut_ad(m_block->page.buf_fix_count()); + + /* In case the block is S-latched by page_cleaner. */ + if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock, + __FILE__, __LINE__, &m_mtr)) { + m_block = buf_page_get_gen(page_id_t(m_index->table->space_id, + m_page_no), + 0, RW_X_LATCH, + m_block, BUF_GET_IF_IN_POOL, + __FILE__, __LINE__, &m_mtr, &m_err); + + if (m_err != DB_SUCCESS) { + return (m_err); + } + + ut_ad(m_block != NULL); + } + + buf_block_buf_fix_dec(m_block); + + ut_ad(m_block->page.buf_fix_count()); + + ut_ad(m_cur_rec > m_page && m_cur_rec < m_heap_top); + + return (m_err); +} + +/** Split a page +@param[in] page_bulk page to split +@param[in] next_page_bulk next page +@return error code */ +dberr_t +BtrBulk::pageSplit( + PageBulk* page_bulk, + PageBulk* next_page_bulk) +{ + ut_ad(page_bulk->getPageZip() != NULL); + + if (page_bulk->getRecNo() <= 1) { + return(DB_TOO_BIG_RECORD); + } + + /* Initialize a new page */ + PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL, + page_bulk->getLevel()); + dberr_t err = new_page_bulk.init(); + if (err != DB_SUCCESS) { + return(err); + } + + /* Copy the upper half to the new page. */ + rec_t* split_rec = page_bulk->getSplitRec(); + new_page_bulk.copyIn(split_rec); + page_bulk->copyOut(split_rec); + + /* Commit the pages after split. */ + err = pageCommit(page_bulk, &new_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + err = pageCommit(&new_page_bulk, next_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + return(err); +} + +/** Commit(finish) a page. We set next/prev page no, compress a page of +compressed table and split the page if compression fails, insert a node +pointer to father page if needed, and commit mini-transaction. +@param[in] page_bulk page to commit +@param[in] next_page_bulk next page +@param[in] insert_father false when page_bulk is a root page and + true when it's a non-root page +@return error code */ +dberr_t +BtrBulk::pageCommit( + PageBulk* page_bulk, + PageBulk* next_page_bulk, + bool insert_father) +{ + page_bulk->finish(); + + /* Set page links */ + if (next_page_bulk != NULL) { + ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel()); + + page_bulk->setNext(next_page_bulk->getPageNo()); + next_page_bulk->setPrev(page_bulk->getPageNo()); + } else { + ut_ad(!page_has_next(page_bulk->getPage())); + /* If a page is released and latched again, we need to + mark it modified in mini-transaction. */ + page_bulk->set_modified(); + } + + ut_ad(!rw_lock_own_flagged(&m_index->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX + | RW_LOCK_FLAG_S)); + + /* Compress page if it's a compressed table. */ + if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) { + return(pageSplit(page_bulk, next_page_bulk)); + } + + /* Insert node pointer to father page. */ + if (insert_father) { + dtuple_t* node_ptr = page_bulk->getNodePtr(); + dberr_t err = insert(node_ptr, page_bulk->getLevel()+1); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Commit mtr. */ + page_bulk->commit(true); + + return(DB_SUCCESS); +} + +/** Log free check */ +inline void BtrBulk::logFreeCheck() +{ + if (log_sys.check_flush_or_checkpoint()) { + release(); + + log_check_margins(); + + latch(); + } +} + +/** Release all latches */ +void +BtrBulk::release() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + page_bulk->release(); + } +} + +/** Re-latch all latches */ +void +BtrBulk::latch() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } +} + +/** Insert a tuple to page in a level +@param[in] tuple tuple to insert +@param[in] level B-tree level +@return error code */ +dberr_t +BtrBulk::insert( + dtuple_t* tuple, + ulint level) +{ + bool is_left_most = false; + dberr_t err = DB_SUCCESS; + + /* Check if we need to create a PageBulk for the level. */ + if (level + 1 > m_page_bulks.size()) { + PageBulk* new_page_bulk + = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL, + level)); + err = new_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(new_page_bulk); + return(err); + } + + m_page_bulks.push_back(new_page_bulk); + ut_ad(level + 1 == m_page_bulks.size()); + m_root_level = level; + + is_left_most = true; + } + + ut_ad(m_page_bulks.size() > level); + + PageBulk* page_bulk = m_page_bulks.at(level); + + if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) { + /* The node pointer must be marked as the predefined minimum + record, as there is no lower alphabetical limit to records in + the leftmost node of a level: */ + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_INFO_MIN_REC_FLAG); + } + + ulint n_ext = 0; + ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext); + big_rec_t* big_rec = NULL; + rec_t* rec = NULL; + rec_offs* offsets = NULL; + + if (page_bulk->needExt(tuple, rec_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext); + + if (big_rec == NULL) { + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(m_index, tuple, n_ext); + } + + if (page_bulk->getPageZip() != NULL + && page_zip_is_too_big(m_index, tuple)) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + if (!page_bulk->isSpaceAvailable(rec_size)) { + /* Create a sibling page_bulk. */ + PageBulk* sibling_page_bulk; + sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, + FIL_NULL, level)); + err = sibling_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Commit page bulk. */ + err = pageCommit(page_bulk, sibling_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(sibling_page_bulk); + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Set new page bulk to page_bulks. */ + ut_ad(sibling_page_bulk->getLevel() <= m_root_level); + m_page_bulks.at(level) = sibling_page_bulk; + + UT_DELETE(page_bulk); + page_bulk = sibling_page_bulk; + + /* Important: log_free_check whether we need a checkpoint. */ + if (page_is_leaf(sibling_page_bulk->getPage())) { + if (trx_is_interrupted(m_trx)) { + err = DB_INTERRUPTED; + goto func_exit; + } + + srv_inc_activity_count(); + logFreeCheck(); + } + } + + /* Convert tuple to rec. */ + rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc( + page_bulk->m_heap, rec_size)), m_index, tuple, n_ext); + offsets = rec_get_offsets(rec, m_index, offsets, level + ? 0 : m_index->n_core_fields, + ULINT_UNDEFINED, &page_bulk->m_heap); + + page_bulk->insert(rec, offsets); + + if (big_rec != NULL) { + ut_ad(dict_index_is_clust(m_index)); + ut_ad(page_bulk->getLevel() == 0); + ut_ad(page_bulk == m_page_bulks.at(0)); + + /* Release all pages above the leaf level */ + for (ulint level = 1; level <= m_root_level; level++) { + m_page_bulks.at(level)->release(); + } + + err = page_bulk->storeExt(big_rec, offsets); + + /* Latch */ + for (ulint level = 1; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } + } + +func_exit: + if (big_rec != NULL) { + dtuple_convert_back_big_rec(m_index, tuple, big_rec); + } + + return(err); +} + +/** Btree bulk load finish. We commit the last page in each level +and copy the last page in top level to the root page of the index +if no error occurs. +@param[in] err whether bulk load was successful until now +@return error code */ +dberr_t +BtrBulk::finish(dberr_t err) +{ + uint32_t last_page_no = FIL_NULL; + + ut_ad(!m_index->table->is_temporary()); + + if (m_page_bulks.size() == 0) { + /* The table is empty. The root page of the index tree + is already in a consistent state. No need to flush. */ + return(err); + } + + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + /* Finish all page bulks */ + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + last_page_no = page_bulk->getPageNo(); + + if (err == DB_SUCCESS) { + err = pageCommit(page_bulk, NULL, + level != m_root_level); + } + + if (err != DB_SUCCESS) { + pageAbort(page_bulk); + } + + UT_DELETE(page_bulk); + } + + if (err == DB_SUCCESS) { + rec_t* first_rec; + mtr_t mtr; + buf_block_t* last_block; + PageBulk root_page_bulk(m_index, m_trx->id, + m_index->page, m_root_level); + + mtr.start(); + m_index->set_modified(mtr); + mtr_x_lock_index(m_index, &mtr); + + ut_ad(last_page_no != FIL_NULL); + last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH, + false, &mtr); + first_rec = page_rec_get_next( + page_get_infimum_rec(last_block->frame)); + ut_ad(page_rec_is_user_rec(first_rec)); + + /* Copy last page to root page. */ + err = root_page_bulk.init(); + if (err != DB_SUCCESS) { + mtr.commit(); + return(err); + } + root_page_bulk.copyIn(first_rec); + root_page_bulk.finish(); + + /* Remove last page. */ + btr_page_free(m_index, last_block, &mtr); + + mtr.commit(); + + err = pageCommit(&root_page_bulk, NULL, false); + ut_ad(err == DB_SUCCESS); + } + + ut_ad(!sync_check_iterate(dict_sync_check())); + + ut_ad(err != DB_SUCCESS + || btr_validate_index(m_index, NULL) == DB_SUCCESS); + return(err); +} diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc new file mode 100644 index 00000000..5bb2a0e2 --- /dev/null +++ b/storage/innobase/btr/btr0cur.cc @@ -0,0 +1,8279 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2015, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" +#include "row0upd.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" +#include "srv0start.h" +#include "mysql_com.h" +#include "dict0stats.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" +#endif /* WITH_WSREP */ + +/** Buffered B-tree operation types, introduced as part of delete buffering. */ +enum btr_op_t { + BTR_NO_OP = 0, /*!< Not buffered */ + BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ + BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ + BTR_DELETE_OP, /*!< Purge a delete-marked record */ + BTR_DELMARK_OP /*!< Mark a record for deletion */ +}; + +/** Modification types for the B-tree operation. + Note that the order must be DELETE, BOTH, INSERT !! + */ +enum btr_intention_t { + BTR_INTENTION_DELETE, + BTR_INTENTION_BOTH, + BTR_INTENTION_INSERT +}; + +/** For the index->lock scalability improvement, only possibility of clear +performance regression observed was caused by grown huge history list length. +That is because the exclusive use of index->lock also worked as reserving +free blocks and read IO bandwidth with priority. To avoid huge glowing history +list as same level with previous implementation, prioritizes pessimistic tree +operations by purge as the previous, when it seems to be growing huge. + + Experimentally, the history list length starts to affect to performance +throughput clearly from about 100000. */ +#define BTR_CUR_FINE_HISTORY_LENGTH 100000 + +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ +Atomic_counter<ulint> btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_non_sea_old; +#ifdef BTR_CUR_HASH_ADAPT +/** Number of successful adaptive hash index lookups in +btr_cur_search_to_nth_level(). */ +ulint btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ + +/** Estimated table level stats from sampled value. +@param value sampled stats +@param index index being sampled +@param sample number of sampled rows +@param ext_size external stored data size +@param not_empty table not empty +@return estimated table wide stats from sampled value */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \ + (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \ + + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size))) + +/* @} */ + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height); /*!< in: root node height in tree */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ + +/*==================== B-TREE SEARCH =========================*/ + +/** Latches the leaf page or pages requested. +@param[in] block leaf page where the search converged +@param[in] latch_mode BTR_SEARCH_LEAF, ... +@param[in] cursor cursor +@param[in] mtr mini-transaction +@return blocks and savepoints which actually latched. */ +btr_latch_leaves_t +btr_cur_latch_leaves( + buf_block_t* block, + ulint latch_mode, + btr_cur_t* cursor, + mtr_t* mtr) +{ + rw_lock_type_t mode; + uint32_t left_page_no; + uint32_t right_page_no; + buf_block_t* get_block; + bool spatial; + btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; + + compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); + ut_ad(block->page.id().space() == cursor->index->table->space->id); + + spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info; + ut_ad(block->page.in_file()); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + case BTR_SEARCH_TREE: + if (spatial) { + cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS] + = mtr_set_savepoint(mtr); + } + + mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH; + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + block->page.id().page_no(), mode, + true, mtr); + latch_leaves.blocks[1] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS] + = get_block; + } + + return(latch_leaves); + case BTR_MODIFY_TREE: + /* It is exclusive for other operations which calls + btr_page_set_prev() */ + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + /* x-latch also siblings from left to right */ + left_page_no = btr_page_get_prev(block->frame); + + if (left_page_no != FIL_NULL) { + + if (spatial) { + cursor->rtr_info->tree_savepoints[ + RTR_MAX_LEVELS] = mtr_set_savepoint(mtr); + } + + latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, left_page_no, RW_X_LATCH, + true, mtr); + latch_leaves.blocks[0] = get_block; + + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS] + = get_block; + } + } + + if (spatial) { + cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1] + = mtr_set_savepoint(mtr); + } + + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, block->page.id().page_no(), + RW_X_LATCH, true, mtr); + latch_leaves.blocks[1] = get_block; + +#ifdef UNIV_BTR_DEBUG + /* Sanity check only after both the blocks are latched. */ + if (latch_leaves.blocks[0] != NULL) { + ut_a(page_is_comp(latch_leaves.blocks[0]->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame) + == block->page.id().page_no()); + } + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + + if (spatial) { + cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1] + = get_block; + } + + right_page_no = btr_page_get_next(block->frame); + + if (right_page_no != FIL_NULL) { + if (spatial) { + cursor->rtr_info->tree_savepoints[ + RTR_MAX_LEVELS + 2] = mtr_set_savepoint( + mtr); + } + latch_leaves.savepoints[2] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + right_page_no, RW_X_LATCH, + true, mtr); + latch_leaves.blocks[2] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_prev(get_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + if (spatial) { + cursor->rtr_info->tree_blocks[ + RTR_MAX_LEVELS + 2] = get_block; + } + } + + return(latch_leaves); + + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH; + /* latch also left sibling */ + rw_lock_s_lock(&block->lock); + left_page_no = btr_page_get_prev(block->frame); + rw_lock_s_unlock(&block->lock); + + if (left_page_no != FIL_NULL) { + latch_leaves.savepoints[0] = mtr_set_savepoint(mtr); + get_block = btr_block_get( + *cursor->index, left_page_no, mode, + true, mtr); + latch_leaves.blocks[0] = get_block; + cursor->left_block = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); + ut_a(btr_page_get_next(get_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + } + + latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); + get_block = btr_block_get(*cursor->index, + block->page.id().page_no(), mode, + true, mtr); + latch_leaves.blocks[1] = get_block; +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(get_block->frame) + == page_is_comp(block->frame)); +#endif /* UNIV_BTR_DEBUG */ + return(latch_leaves); + case BTR_CONT_MODIFY_TREE: + ut_ad(dict_index_is_spatial(cursor->index)); + return(latch_leaves); + } + + ut_error; + return(latch_leaves); +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + const fil_space_t* space = index->table->space; + if (!space) { +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + page_t* root = btr_root_get(index, mtr); + + if (!root || btr_cur_instant_root_init(index, root)) { + goto unreadable; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); + dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF, + &cur, 0, mtr); + ut_d(index->in_instant_init = false); + if (err != DB_SUCCESS) { + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->frame)); + + page_cur_move_to_next(&cur.page_cur); + + const rec_t* rec = cur.page_cur.rec; + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec_get_info_bits(rec, comp); + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (!index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. */ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. */ + + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. */ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], + len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. */ + } + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). */ + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + cur.page_cur.block->zip_size(), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(fil_page_index_page_check(page)); + ut_ad(!page_has_siblings(page)); + ut_ad(page_get_space_id(page) == index->table->space_id); + ut_ad(page_get_page_no(page) == index->page); + ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table)); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + ut_ad("wrong page type" == 0); + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */ + ut_ad(!page_is_comp(page) || !page_get_instant(page)); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields. */ + return true; + } + + if (n > REC_MAX_N_FIELDS) { + return true; + } + + index->n_core_fields = n & dict_index_t::MAX_N_FIELDS; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(index->get_n_nullable(n))); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; +} + +/** Optimistically latches the leaf page or pages requested. +@param[in] block guessed buffer block +@param[in] modify_clock modify clock value +@param[in,out] latch_mode BTR_SEARCH_LEAF, ... +@param[in,out] cursor cursor +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@return true if success */ +bool +btr_cur_optimistic_latch_leaves( + buf_block_t* block, + ib_uint64_t modify_clock, + ulint* latch_mode, + btr_cur_t* cursor, + const char* file, + unsigned line, + mtr_t* mtr) +{ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + + switch (*latch_mode) { + default: + ut_error; + return(false); + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + return(buf_page_optimistic_get(*latch_mode, block, + modify_clock, file, line, mtr)); + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + rw_lock_s_lock(&block->lock); + if (block->modify_clock != modify_clock) { + rw_lock_s_unlock(&block->lock); + return false; + } + const uint32_t curr_page_no = block->page.id().page_no(); + const uint32_t left_page_no = btr_page_get_prev(block->frame); + rw_lock_s_unlock(&block->lock); + + const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV + ? RW_S_LATCH : RW_X_LATCH; + + if (left_page_no != FIL_NULL) { + dberr_t err = DB_SUCCESS; + cursor->left_block = buf_page_get_gen( + page_id_t(cursor->index->table->space_id, + left_page_no), + cursor->index->table->space->zip_size(), + mode, nullptr, BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, mtr, &err); + + if (!cursor->left_block) { + cursor->index->table->file_unreadable = true; + } + + if (cursor->left_block->page.status + == buf_page_t::FREED + || btr_page_get_next(cursor->left_block->frame) + != curr_page_no) { + /* release the left block */ + btr_leaf_page_release( + cursor->left_block, mode, mtr); + return false; + } + } else { + cursor->left_block = NULL; + } + + if (buf_page_optimistic_get(mode, block, modify_clock, + file, line, mtr)) { + if (btr_page_get_prev(block->frame) == left_page_no) { + /* block was already buffer-fixed while + entering the function and + buf_page_optimistic_get() buffer-fixes + it again. */ + ut_ad(2 <= block->page.buf_fix_count()); + *latch_mode = mode; + return(true); + } else { + /* release the block and decrement of + buf_fix_count which was incremented + in buf_page_optimistic_get() */ + btr_leaf_page_release(block, mode, mtr); + } + } + + ut_ad(block->page.buf_fix_count()); + /* release the left block */ + if (cursor->left_block != NULL) { + btr_leaf_page_release(cursor->left_block, + mode, mtr); + } + } + + return false; +} + +/** +Gets intention in btr_intention_t from latch_mode, and cleares the intention +at the latch_mode. +@param latch_mode in/out: pointer to latch_mode +@return intention for latching tree */ +static +btr_intention_t +btr_cur_get_and_clear_intention( + ulint *latch_mode) +{ + btr_intention_t intention; + + switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) { + case BTR_LATCH_FOR_INSERT: + intention = BTR_INTENTION_INSERT; + break; + case BTR_LATCH_FOR_DELETE: + intention = BTR_INTENTION_DELETE; + break; + default: + /* both or unknown */ + intention = BTR_INTENTION_BOTH; + } + *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)); + + return(intention); +} + +/** +Gets the desired latch type for the root leaf (root page is root leaf) +at the latch mode. +@param latch_mode in: BTR_SEARCH_LEAF, ... +@return latch type */ +static +rw_lock_type_t +btr_cur_latch_for_root_leaf( + ulint latch_mode) +{ + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_SEARCH_TREE: + case BTR_SEARCH_PREV: + return(RW_S_LATCH); + case BTR_MODIFY_LEAF: + case BTR_MODIFY_TREE: + case BTR_MODIFY_PREV: + return(RW_X_LATCH); + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + /* A root page should be latched already, + and don't need to be latched here. + fall through (RW_NO_LATCH) */ + case BTR_NO_LATCHES: + return(RW_NO_LATCH); + } + + ut_error; + return(RW_NO_LATCH); /* avoid compiler warnings */ +} + +/** Detects whether the modifying record might need a modifying tree structure. +@param[in] index index +@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@param[in] rec_size size of the record or max size of node_ptr +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] mtr mtr +@return true if tree modification is needed */ +static +bool +btr_cur_will_modify_tree( + dict_index_t* index, + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec, + ulint rec_size, + ulint zip_size, + mtr_t* mtr) +{ + ut_ad(!page_is_leaf(page)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* Pessimistic delete of the first record causes delete & insert + of node_ptr at upper level. And a subsequent page shrink is + possible. It causes delete of node_ptr at the upper level. + So we should pay attention also to 2nd record not only + first record and last record. Because if the "delete & insert" are + done for the different page, the 2nd record become + first record and following compress might delete the record and causes + the uppper level node_ptr modification. */ + + const ulint n_recs = page_get_n_recs(page); + + if (lock_intention <= BTR_INTENTION_BOTH) { + compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH); + compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT); + + if (!page_has_siblings(page)) { + return true; + } + + ulint margin = rec_size; + + if (lock_intention == BTR_INTENTION_BOTH) { + ulint level = btr_page_get_level(page); + + /* This value is the worst expectation for the node_ptr + records to be deleted from this page. It is used to + expect whether the cursor position can be the left_most + record in this page or not. */ + ulint max_nodes_deleted = 0; + + /* By modifying tree operations from the under of this + level, logically (2 ^ (level - 1)) opportunities to + deleting records in maximum even unreally rare case. */ + if (level > 7) { + /* TODO: adjust this practical limit. */ + max_nodes_deleted = 64; + } else if (level > 0) { + max_nodes_deleted = (ulint)1 << (level - 1); + } + /* check delete will cause. (BTR_INTENTION_BOTH + or BTR_INTENTION_DELETE) */ + if (n_recs <= max_nodes_deleted * 2 + || page_rec_is_first(rec, page)) { + /* The cursor record can be the left most record + in this page. */ + return true; + } + + if (page_has_prev(page) + && page_rec_distance_is_at_most( + page_get_infimum_rec(page), rec, + max_nodes_deleted)) { + return true; + } + + if (page_has_next(page) + && page_rec_distance_is_at_most( + rec, page_get_supremum_rec(page), + max_nodes_deleted)) { + return true; + } + + /* Delete at leftmost record in a page causes delete + & insert at its parent page. After that, the delete + might cause btr_compress() and delete record at its + parent page. Thus we should consider max deletes. */ + margin *= max_nodes_deleted; + } + + /* Safe because we already have SX latch of the index tree */ + if (page_get_data_size(page) + < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) { + return(true); + } + } + + if (lock_intention >= BTR_INTENTION_BOTH) { + /* check insert will cause. BTR_INTENTION_BOTH + or BTR_INTENTION_INSERT*/ + + /* Once we invoke the btr_cur_limit_optimistic_insert_debug, + we should check it here in advance, since the max allowable + records in a page is limited. */ + LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true); + + /* needs 2 records' space for the case the single split and + insert cannot fit. + page_get_max_insert_size_after_reorganize() includes space + for page directory already */ + ulint max_size + = page_get_max_insert_size_after_reorganize(page, 2); + + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size + || max_size < rec_size * 2) { + return(true); + } + + /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED. + This is based on the worst case, and we could invoke + page_zip_available() on the block->page.zip. */ + /* needs 2 records' space also for worst compress rate. */ + if (zip_size + && page_zip_empty_size(index->n_fields, zip_size) + <= rec_size * 2 + page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + 2)) { + return(true); + } + } + + return(false); +} + +/** Detects whether the modifying record might need a opposite modification +to the intention. +@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@return true if tree modification is needed */ +static +bool +btr_cur_need_opposite_intention( + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec) +{ + switch (lock_intention) { + case BTR_INTENTION_DELETE: + return (page_has_prev(page) && page_rec_is_first(rec, page)) || + (page_has_next(page) && page_rec_is_last(rec, page)); + case BTR_INTENTION_INSERT: + return page_has_next(page) && page_rec_is_last(rec, page); + case BTR_INTENTION_BOTH: + return(false); + } + + ut_error; + return(false); +} + +/** +@param[in] index b-tree +@return maximum size of a node pointer record in bytes */ +static ulint btr_node_ptr_max_size(const dict_index_t* index) +{ + if (dict_index_is_ibuf(index)) { + /* cannot estimate accurately */ + /* This is universal index for change buffer. + The max size of the entry is about max key length * 2. + (index key + primary key to be inserted to the index) + (The max key length is UNIV_PAGE_SIZE / 16 * 3 at + ha_innobase::max_supported_key_length(), + considering MAX_KEY_LENGTH = 3072 at MySQL imposes + the 3500 historical InnoDB value for 16K page size case.) + For the universal index, node_ptr contains most of the entry. + And 512 is enough to contain ibuf columns and meta-data */ + return srv_page_size / 8 * 3 + 512; + } + + /* Each record has page_no, length of page_no and header. */ + ulint comp = dict_table_is_comp(index->table); + ulint rec_max_size = comp + ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable) + : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES + + 2 * index->n_fields; + + /* Compute the maximum possible record size. */ + for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* Determine the maximum length of the index field. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + rec_max_size += field_max_size; + continue; + } + + field_max_size = dict_col_get_max_size(col); + if (UNIV_UNLIKELY(!field_max_size)) { + switch (col->mtype) { + case DATA_VARCHAR: + if (!comp + && (!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS"))) { + break; + } + /* fall through */ + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* CHAR(0) and VARCHAR(0) are possible + data type definitions in MariaDB. + The InnoDB internal SQL parser maps + CHAR to DATA_VARCHAR, so DATA_CHAR (or + DATA_MYSQL) is only coming from the + MariaDB SQL layer. */ + if (comp) { + /* Add a length byte, because + fixed-length empty field are + encoded as variable-length. + For ROW_FORMAT=REDUNDANT, + these bytes were added to + rec_max_size before this loop. */ + rec_max_size++; + } + continue; + } + + /* SYS_FOREIGN.ID is defined as CHAR in the + InnoDB internal SQL parser, which translates + into the incorrect VARCHAR(0). InnoDB does + not enforce maximum lengths of columns, so + that is why any data can be inserted in the + first place. + + Likewise, SYS_FOREIGN.FOR_NAME, + SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are + defined as CHAR, and also they are part of a key. */ + + ut_ad(!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS")); + ut_ad(!comp); + ut_ad(col->mtype == DATA_VARCHAR); + + rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX) + ? REDUNDANT_REC_MAX_DATA_SIZE + : page_get_free_space_of_empty(FALSE) / 2; + } else if (field_max_size == NAME_LEN && i == 1 + && (!strcmp(index->table->name.m_name, + TABLE_STATS_NAME) + || !strcmp(index->table->name.m_name, + INDEX_STATS_NAME))) { + /* Interpret "table_name" as VARCHAR(199) even + if it was incorrectly defined as VARCHAR(64). + While the caller of ha_innobase enforces the + maximum length on any data written, the InnoDB + internal SQL parser will happily write as much + data as is provided. The purpose of this hack + is to avoid InnoDB hangs after persistent + statistics on partitioned tables are + deleted. */ + field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN; + } + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len + && field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } + + rec_max_size += field_max_size; + } + + return rec_max_size; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will a have a sensible value. + +If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. */ +dberr_t +btr_cur_search_to_nth_level_func( + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...; + Inserts should always be made using + PAGE_CUR_LE to search the position! */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with + at most one of BTR_INSERT, BTR_DELETE_MARK, + BTR_DELETE, or BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if ahi_latch, we might not have a + cursor page latch, we assume that ahi_latch + protects the record! */ + btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ +#ifdef BTR_CUR_HASH_ADAPT + rw_lock_t* ahi_latch, + /*!< in: currently held btr_search_latch + (in RW_S_LATCH mode), or NULL */ +#endif /* BTR_CUR_HASH_ADAPT */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written + (0 if none) */ +{ + page_t* page = NULL; /* remove warning */ + buf_block_t* block; + buf_block_t* guess; + ulint height; + ulint up_match; + ulint up_bytes; + ulint low_match; + ulint low_bytes; + ulint rw_latch; + page_cur_mode_t page_mode; + page_cur_mode_t search_mode = PAGE_CUR_UNSUPP; + ulint buf_mode; + ulint estimate; + ulint node_ptr_max_size = srv_page_size / 2; + page_cur_t* page_cursor; + btr_op_t btr_op; + ulint root_height = 0; /* remove warning */ + dberr_t err = DB_SUCCESS; + + btr_intention_t lock_intention; + bool modify_external; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + bool detected_same_key_root = false; + + bool retrying_for_search_prev = false; + ulint leftmost_from_level = 0; + buf_block_t** prev_tree_blocks = NULL; + ulint* prev_tree_savepoints = NULL; + ulint prev_n_blocks = 0; + ulint prev_n_releases = 0; + bool need_path = true; + bool rtree_parent_modified = false; + bool mbr_adj = false; + bool found = false; + + DBUG_ENTER("btr_cur_search_to_nth_level"); + +#ifdef BTR_CUR_ADAPT + btr_search_t* info; +#endif /* BTR_CUR_ADAPT */ + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + /* Currently, PAGE_CUR_LE is the only search mode used for searches + ending to upper levels */ + + ut_ad(level == 0 || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(!(index->type & DICT_FTS)); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match); + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); +#ifdef UNIV_DEBUG + cursor->up_match = ULINT_UNDEFINED; + cursor->low_match = ULINT_UNDEFINED; +#endif /* UNIV_DEBUG */ + + ibool s_latch_by_caller; + + s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; + + ut_ad(!s_latch_by_caller + || srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_SX_LOCK)); + + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. */ + switch (UNIV_EXPECT(latch_mode + & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK), + 0)) { + case 0: + btr_op = BTR_NO_OP; + break; + case BTR_INSERT: + btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op = BTR_DELETE_OP; + ut_a(cursor->purge_node); + break; + case BTR_DELETE_MARK: + btr_op = BTR_DELMARK_OP; + break; + default: + /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK + should be specified at a time */ + ut_error; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index)); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index)); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary()); + /* Operation on the spatial index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index)); + + estimate = latch_mode & BTR_ESTIMATE; + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + modify_external = latch_mode & BTR_MODIFY_EXTERNAL; + + /* Turn the flags unrelated to the latch mode off. */ + latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF); + + ut_ad(!s_latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_SEARCH_TREE + || latch_mode == BTR_MODIFY_LEAF); + + ut_ad(autoinc == 0 || dict_index_is_clust(index)); + ut_ad(autoinc == 0 + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF); + ut_ad(autoinc == 0 || level == 0); + + cursor->flag = BTR_CUR_BINARY; + cursor->index = index; + +#ifndef BTR_CUR_ADAPT + guess = NULL; +#else + info = btr_search_get_info(index); + guess = info->root_guess; + +#ifdef BTR_CUR_HASH_ADAPT + +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + if (autoinc == 0 + && latch_mode <= BTR_MODIFY_LEAF + && info->last_hash_succ +# ifdef MYSQL_INDEX_DISABLE_AHI + && !index->disable_ahi +# endif + && !estimate +# ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + && !dict_index_is_spatial(index) + /* If !ahi_latch, we do a dirty read of + btr_search_enabled below, and btr_search_guess_on_hash() + will have to check it again. */ + && btr_search_enabled + && !modify_external + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, + ahi_latch, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + btr_cur_n_sea++; + + DBUG_RETURN(err); + } +# endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ + btr_cur_n_non_sea++; + + /* If the hash search did not succeed, do binary search down the + tree */ + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + /* Release possible search latch to obey latching order */ + rw_lock_s_unlock(ahi_latch); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { +x_latch_index: + mtr_x_lock_index(index, mtr); + } else if (index->is_spatial() + && lock_intention <= BTR_INTENTION_BOTH) { + /* X lock the if there is possibility of + pessimistic delete on spatial index. As we could + lock upward for the tree */ + goto x_latch_index; + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + /* Do nothing */ + ut_ad(srv_read_only_mode + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + if (dict_index_is_spatial(index) + && latch_mode == BTR_CONT_MODIFY_TREE) { + /* If we are about to locating parent page for split + and/or merge operation for R-Tree index, X latch + the parent */ + upper_rw_latch = RW_X_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + break; + default: + if (!srv_read_only_mode) { + if (s_latch_by_caller) { + ut_ad(rw_lock_own(dict_index_get_lock(index), + RW_LOCK_S)); + } else if (!modify_external) { + /* BTR_SEARCH_TREE is intended to be used with + BTR_ALREADY_S_LATCHED */ + ut_ad(latch_mode != BTR_SEARCH_TREE); + + mtr_s_lock_index(index, mtr); + } else { + /* BTR_MODIFY_EXTERNAL needs to be excluded */ + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + + const ulint zip_size = index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + + height = ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode) + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } + + /* Loop and search until we arrive at the desired level */ + btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}}; + +search_loop: + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + rtree_parent_modified = false; + + if (height != 0) { + /* We are about to fetch the root or a non-leaf page. */ + if ((latch_mode != BTR_MODIFY_TREE || height == level) + && !retrying_for_search_prev) { + /* If doesn't have SX or X latch of index, + each pages should be latched before reading. */ + if (height == ULINT_UNDEFINED + && upper_rw_latch == RW_S_LATCH + && (modify_external || autoinc)) { + /* needs sx-latch of root page + for fseg operation or for writing + PAGE_ROOT_AUTO_INC */ + rw_latch = RW_SX_LATCH; + } else { + rw_latch = upper_rw_latch; + } + } + } else if (latch_mode <= BTR_MODIFY_LEAF) { + rw_latch = latch_mode; + + if (btr_op != BTR_NO_OP + && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) { + + /* Try to buffer the operation if the leaf + page is not in the buffer pool. */ + + buf_mode = btr_op == BTR_DELETE_OP + ? BUF_GET_IF_IN_POOL_OR_WATCH + : BUF_GET_IF_IN_POOL; + } + } + +retry_page_get: + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + block = buf_page_get_gen(page_id, zip_size, rw_latch, guess, + buf_mode, file, line, mtr, &err, + height == 0 && !index->is_clust()); + tree_blocks[n_blocks] = block; + + /* Note that block==NULL signifies either an error or change + buffering. */ + + if (err != DB_SUCCESS) { + ut_ad(block == NULL); + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + + if (block == NULL) { + /* This must be a search to perform an insert/delete + mark/ delete; try using the insert/delete buffer */ + + ut_ad(height == 0); + ut_ad(cursor->thr); + + switch (btr_op) { + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index, + page_id, zip_size, cursor->thr)) { + + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(!dict_index_is_spatial(index)); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index, page_id, zip_size, + cursor->thr)) { + + cursor->flag = BTR_CUR_DEL_MARK_IBUF; + + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + ut_ad(!dict_index_is_spatial(index)); + + if (!row_purge_poss_sec(cursor->purge_node, + index, tuple)) { + + /* The record cannot be purged yet. */ + cursor->flag = BTR_CUR_DELETE_REF; + } else if (ibuf_insert(IBUF_OP_DELETE, tuple, + index, page_id, zip_size, + cursor->thr)) { + + /* The purge was buffered. */ + cursor->flag = BTR_CUR_DELETE_IBUF; + } else { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id); + break; + } + + buf_pool.watch_unset(page_id); + goto func_exit; + + default: + ut_error; + } + + /* Insert to the insert/delete buffer did not succeed, we + must read the page from disk. */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + if (retrying_for_search_prev && height != 0) { + /* also latch left sibling */ + uint32_t left_page_no; + buf_block_t* get_block; + + ut_ad(rw_latch == RW_NO_LATCH); + + rw_latch = upper_rw_latch; + + rw_lock_s_lock(&block->lock); + left_page_no = btr_page_get_prev(buf_block_get_frame(block)); + rw_lock_s_unlock(&block->lock); + + if (left_page_no != FIL_NULL) { + ut_ad(prev_n_blocks < leftmost_from_level); + + prev_tree_savepoints[prev_n_blocks] + = mtr_set_savepoint(mtr); + get_block = buf_page_get_gen( + page_id_t(page_id.space(), left_page_no), + zip_size, rw_latch, NULL, buf_mode, + file, line, mtr, &err); + prev_tree_blocks[prev_n_blocks] = get_block; + prev_n_blocks++; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + + /* BTR_MODIFY_TREE doesn't update prev/next_page_no, + without their parent page's lock. So, not needed to + retry here, because we have the parent page's lock. */ + } + + /* release RW_NO_LATCH page and lock with RW_S_LATCH */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, buf_mode, + file, line, mtr, &err); + tree_blocks[n_blocks] = block; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto func_exit; + } + } + + page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* The root page is also a leaf page (root_leaf). + We should reacquire the page, because the root page + is latched differently from leaf pages. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH); + ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc); + ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + goto search_loop; + } + + if (rw_latch != RW_NO_LATCH) { +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + buf_block_dbg_add_level( + block, dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + root_height = height; + cursor->tree_height = root_height + 1; + + if (dict_index_is_spatial(index)) { + ut_ad(cursor->rtr_info); + + /* If SSN in memory is not initialized, fetch + it from root page */ + if (!rtr_get_current_ssn_id(index)) { + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); + } + + /* Save the MBR */ + cursor->rtr_info->thr = cursor->thr; + rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr); + } + +#ifdef BTR_CUR_ADAPT + info->root_guess = block; +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + latch_leaves = btr_cur_latch_leaves( + block, latch_mode, cursor, mtr); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + if (!s_latch_by_caller + && !srv_read_only_mode + && !modify_external) { + /* Release the tree s-latch */ + /* NOTE: BTR_MODIFY_EXTERNAL + needs to keep tree sx-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + /* release upper blocks */ + if (retrying_for_search_prev) { + ut_ad(!autoinc); + for (; + prev_n_releases < prev_n_blocks; + prev_n_releases++) { + mtr_release_block_at_savepoint( + mtr, + prev_tree_savepoints[ + prev_n_releases], + prev_tree_blocks[ + prev_n_releases]); + } + } + + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0 + && (modify_external || autoinc)) { + /* keep the root page latch */ + ut_ad(mtr->memo_contains_flagged( + tree_blocks[n_releases], + MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + continue; + } + + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + page_mode = mode; + } + + if (dict_index_is_spatial(index)) { + /* Remember the page search mode */ + search_mode = page_mode; + + /* Some adjustment on search mode, when the + page search mode is PAGE_CUR_RTREE_LOCATE + or PAGE_CUR_RTREE_INSERT, as we are searching + with MBRs. When it is not the target level, we + should search all sub-trees that "CONTAIN" the + search range/MBR. When it is at the target + level, the search becomes PAGE_CUR_LE */ + if (page_mode == PAGE_CUR_RTREE_LOCATE + && level == height) { + if (level == 0) { + page_mode = PAGE_CUR_LE; + } else { + page_mode = PAGE_CUR_RTREE_GET_FATHER; + } + } + + if (page_mode == PAGE_CUR_RTREE_INSERT) { + page_mode = (level == height) + ? PAGE_CUR_LE + : PAGE_CUR_RTREE_INSERT; + + ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE); + } + + /* "need_path" indicates if we need to tracking the parent + pages, if it is not spatial comparison, then no need to + track it */ + if (page_mode < PAGE_CUR_CONTAIN) { + need_path = false; + } + + up_match = 0; + low_match = 0; + + if (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE) { + /* Tree are locked, no need for Page Lock to protect + the "path" */ + cursor->rtr_info->need_page_lock = false; + } + } + + if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) { + ut_ad(need_path); + found = rtr_cur_search_with_match( + block, index, tuple, page_mode, page_cursor, + cursor->rtr_info); + + /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */ + if (search_mode == PAGE_CUR_RTREE_INSERT + && cursor->rtr_info->mbr_adj) { + if (latch_mode & BTR_MODIFY_LEAF) { + /* Parent MBR needs updated, should retry + with BTR_MODIFY_TREE */ + goto func_exit; + } else if (latch_mode & BTR_MODIFY_TREE) { + rtree_parent_modified = true; + cursor->rtr_info->mbr_adj = false; + mbr_adj = true; + } else { + ut_ad(0); + } + } + + if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) { + cursor->low_match = + DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } +#ifdef BTR_CUR_HASH_ADAPT + } else if (height == 0 && btr_search_enabled + && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG) + && !dict_index_is_spatial(index)) { + /* The adaptive hash index is only used when searching + for leaf pages (height==0), but not in r-trees. + We only need the byte prefix comparison for the purpose + of updating the adaptive hash index. */ + page_cur_search_with_match_bytes( + block, index, tuple, page_mode, &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); +#endif /* BTR_CUR_HASH_ADAPT */ + } else { + /* Search for complete index fields. */ + up_bytes = low_bytes = 0; + page_cur_search_with_match( + block, index, tuple, page_mode, &up_match, + &low_match, page_cursor, + need_path ? cursor->rtr_info : NULL); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor))); + + /* Add Predicate lock if it is serializable isolation + and only if it is in the search case */ + if (dict_index_is_spatial(index) + && cursor->rtr_info->need_prdt_lock + && mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE + && mode >= PAGE_CUR_CONTAIN) { + trx_t* trx = thr_get_trx(cursor->thr); + lock_prdt_t prdt; + + lock_mutex_enter(); + lock_init_prdt_from_mbr( + &prdt, &cursor->rtr_info->mbr, mode, + trx->lock.lock_heap); + lock_mutex_exit(); + + if (rw_latch == RW_NO_LATCH && height != 0) { + rw_lock_s_lock(&(block->lock)); + } + + lock_prdt_lock(block, &prdt, index, LOCK_S, + LOCK_PREDICATE, cursor->thr); + + if (rw_latch == RW_NO_LATCH && height != 0) { + rw_lock_s_unlock(&(block->lock)); + } + } + + if (level != height) { + + const rec_t* node_ptr; + ut_ad(height > 0); + + height--; + guess = NULL; + + node_ptr = page_cur_get_rec(page_cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + +need_opposite_intention: + ut_ad(upper_rw_latch == RW_X_LATCH); + + if (n_releases > 0) { + /* release root block */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(index->page); + up_match = 0; + low_match = 0; + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + goto search_loop; + } + + if (dict_index_is_spatial(index)) { + if (page_rec_is_supremum(node_ptr)) { + cursor->low_match = 0; + cursor->up_match = 0; + goto func_exit; + } + + /* If we are doing insertion or record locating, + remember the tree nodes we visited */ + if (page_mode == PAGE_CUR_RTREE_INSERT + || (search_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF))) { + bool add_latch = false; + + if (latch_mode == BTR_MODIFY_TREE + && rw_latch == RW_NO_LATCH) { + ut_ad(mtr->memo_contains_flagged( + &index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + rw_lock_s_lock(&block->lock); + add_latch = true; + } + + /* Store the parent cursor location */ +#ifdef UNIV_DEBUG + ulint num_stored = rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); +#else + rtr_store_parent_path( + block, cursor, latch_mode, + height + 1, mtr); +#endif + + if (page_mode == PAGE_CUR_RTREE_INSERT) { + btr_pcur_t* r_cursor = + rtr_get_parent_cursor( + cursor, height + 1, + true); + /* If it is insertion, there should + be only one parent for each level + traverse */ +#ifdef UNIV_DEBUG + ut_ad(num_stored == 1); +#endif + + node_ptr = btr_pcur_get_rec(r_cursor); + + } + + if (add_latch) { + rw_lock_s_unlock(&block->lock); + } + + ut_ad(!page_rec_is_supremum(node_ptr)); + } + + ut_ad(page_mode == search_mode + || (page_mode == PAGE_CUR_WITHIN + && search_mode == PAGE_CUR_RTREE_LOCATE)); + + page_mode = search_mode; + } + + /* If the first or the last record of the page + or the same key value to the first record or last record, + the another page might be chosen when BTR_CONT_MODIFY_TREE. + So, the parent page should not released to avoiding deadlock + with blocking the another search with the same key value. */ + if (!detected_same_key_root + && lock_intention == BTR_INTENTION_BOTH + && !dict_index_is_unique(index) + && latch_mode == BTR_MODIFY_TREE + && (up_match >= rec_offs_n_fields(offsets) - 1 + || low_match >= rec_offs_n_fields(offsets) - 1)) { + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + ulint matched_fields; + + ut_ad(upper_rw_latch == RW_X_LATCH); + + if (node_ptr == first_rec + || page_rec_is_last(node_ptr, page)) { + detected_same_key_root = true; + } else { + matched_fields = 0; + + offsets2 = rec_get_offsets( + first_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec(node_ptr, first_rec, + offsets, offsets2, index, false, + &matched_fields); + + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } else { + const rec_t* last_rec; + + last_rec = page_rec_get_prev_const( + page_get_supremum_rec(page)); + + matched_fields = 0; + + offsets2 = rec_get_offsets( + last_rec, index, offsets2, + 0, ULINT_UNDEFINED, &heap); + cmp_rec_rec( + node_ptr, last_rec, + offsets, offsets2, index, + false, &matched_fields); + if (matched_fields + >= rec_offs_n_fields(offsets) - 1) { + detected_same_key_root = true; + } + } + } + } + + /* If the page might cause modify_tree, + we should not release the parent page's lock. */ + if (!detected_same_key_root + && latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr) + && !rtree_parent_modified) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == level + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* We should consider prev_page of parent page, if the node_ptr + is the leftmost of the page. because BTR_SEARCH_PREV and + BTR_MODIFY_PREV latches prev_page of the leaf page. */ + if ((latch_mode == BTR_SEARCH_PREV + || latch_mode == BTR_MODIFY_PREV) + && !retrying_for_search_prev) { + /* block should be latched for consistent + btr_page_get_prev() */ + ut_ad(mtr->memo_contains_flagged( + block, MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + if (page_has_prev(page) + && page_rec_is_first(node_ptr, page)) { + + if (leftmost_from_level == 0) { + leftmost_from_level = height + 1; + } + } else { + leftmost_from_level = 0; + } + + if (height == 0 && leftmost_from_level > 0) { + /* should retry to get also prev_page + from level==leftmost_from_level. */ + retrying_for_search_prev = true; + + prev_tree_blocks = static_cast<buf_block_t**>( + ut_malloc_nokey(sizeof(buf_block_t*) + * leftmost_from_level)); + + prev_tree_savepoints = static_cast<ulint*>( + ut_malloc_nokey(sizeof(ulint) + * leftmost_from_level)); + + /* back to the level (leftmost_from_level+1) */ + ulint idx = n_blocks + - (leftmost_from_level - 1); + + page_id.set_page_no( + tree_blocks[idx]->page.id().page_no()); + + for (ulint i = n_blocks + - (leftmost_from_level - 1); + i <= n_blocks; i++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + + n_blocks -= (leftmost_from_level - 1); + height = leftmost_from_level; + ut_ad(n_releases == 0); + + /* replay up_match, low_match */ + up_match = 0; + low_match = 0; + rtr_info_t* rtr_info = need_path + ? cursor->rtr_info : NULL; + + for (ulint i = 0; i < n_blocks; i++) { + page_cur_search_with_match( + tree_blocks[i], index, tuple, + page_mode, &up_match, + &low_match, page_cursor, + rtr_info); + } + + goto search_loop; + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + + if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) { + /* We're doing a search on an ibuf tree and we're one + level above the leaf page. */ + + ut_ad(level == 0); + + buf_mode = BUF_GET; + rw_latch = RW_NO_LATCH; + goto retry_page_get; + } + + if (dict_index_is_spatial(index) + && page_mode >= PAGE_CUR_CONTAIN + && page_mode != PAGE_CUR_RTREE_INSERT) { + ut_ad(need_path); + rtr_node_path_t* path = + cursor->rtr_info->path; + + if (!path->empty() && found) { + ut_ad(path->back().page_no + == page_id.page_no()); + path->pop_back(); +#ifdef UNIV_DEBUG + if (page_mode == PAGE_CUR_RTREE_LOCATE + && (latch_mode != BTR_MODIFY_LEAF)) { + btr_pcur_t* cur + = cursor->rtr_info->parent_path->back( + ).cursor; + rec_t* my_node_ptr + = btr_pcur_get_rec(cur); + + offsets = rec_get_offsets( + my_node_ptr, index, offsets, + 0, ULINT_UNDEFINED, &heap); + + ulint my_page_no + = btr_node_ptr_get_child_page_no( + my_node_ptr, offsets); + + ut_ad(page_id.page_no() == my_page_no); + } +#endif + } + } + + goto search_loop; + } else if (!dict_index_is_spatial(index) + && latch_mode == BTR_MODIFY_TREE + && lock_intention == BTR_INTENTION_INSERT + && page_has_next(page) + && page_rec_is_last(page_cur_get_rec(page_cursor), page)) { + + /* btr_insert_into_right_sibling() might cause + deleting node_ptr at upper level */ + + guess = NULL; + + if (height == 0) { + /* release the leaf pages if latched */ + for (uint i = 0; i < 3; i++) { + if (latch_leaves.blocks[i] != NULL) { + mtr_release_block_at_savepoint( + mtr, latch_leaves.savepoints[i], + latch_leaves.blocks[i]); + latch_leaves.blocks[i] = NULL; + } + } + } + + goto need_opposite_intention; + } + + if (level != 0) { + ut_ad(!autoinc); + + if (upper_rw_latch == RW_NO_LATCH) { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE + || latch_mode == BTR_CONT_SEARCH_TREE); + buf_block_t* child_block = btr_block_get( + *index, page_id.page_no(), + latch_mode == BTR_CONT_MODIFY_TREE + ? RW_X_LATCH : RW_SX_LATCH, false, mtr); + btr_assert_not_corrupted(child_block, index); + } else { + ut_ad(mtr->memo_contains_flagged(block, + upper_rw_latch)); + btr_assert_not_corrupted(block, index); + + if (s_latch_by_caller) { + ut_ad(latch_mode == BTR_SEARCH_TREE); + /* to exclude modifying tree operations + should sx-latch the index. */ + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_SX_LOCK)); + /* because has sx-latch of index, + can release upper blocks. */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + if (page_mode <= PAGE_CUR_LE) { + cursor->low_match = low_match; + cursor->up_match = up_match; + } + } else { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; + + if (autoinc) { + page_set_autoinc(tree_blocks[0], autoinc, mtr, false); + } + +#ifdef BTR_CUR_HASH_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding search latch. */ + if (!btr_search_enabled) { +# ifdef MYSQL_INDEX_DISABLE_AHI + } else if (index->disable_ahi) { +# endif + } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(index->is_instant()); + /* This may be a search tuple for + btr_pcur_restore_position(). */ + ut_ad(tuple->is_metadata() + || (tuple->is_metadata(tuple->info_bits + ^ REC_STATUS_INSTANT))); + } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) { + /* Only user records belong in the adaptive + hash index. */ + } else { + btr_search_info_update(index, cursor); + } +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + + /* For spatial index, remember what blocks are still latched */ + if (dict_index_is_spatial(index) + && (latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_LEAF)) { + for (ulint i = 0; i < n_releases; i++) { + cursor->rtr_info->tree_blocks[i] = NULL; + cursor->rtr_info->tree_savepoints[i] = 0; + } + + for (ulint i = n_releases; i <= n_blocks; i++) { + cursor->rtr_info->tree_blocks[i] = tree_blocks[i]; + cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i]; + } + } + +func_exit: + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (retrying_for_search_prev) { + ut_free(prev_tree_blocks); + ut_free(prev_tree_savepoints); + } + + if (mbr_adj) { + /* remember that we will need to adjust parent MBR */ + cursor->rtr_info->mbr_adj = true; + } + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + rw_lock_s_lock(ahi_latch); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + DBUG_RETURN(err); +} + +/*****************************************************************//** +Opens a cursor at either end of an index. */ +dberr_t +btr_cur_open_at_index_side_func( +/*============================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf). */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + ulint node_ptr_max_size = srv_page_size / 2; + ulint height; + ulint root_height = 0; /* remove warning */ + rec_t* node_ptr; + ulint estimate; + btr_intention_t lock_intention; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + dberr_t err = DB_SUCCESS; + + rec_offs_init(offsets_); + + estimate = latch_mode & BTR_ESTIMATE; + latch_mode &= ulint(~BTR_ESTIMATE); + + ut_ad(level != ULINT_UNDEFINED); + + bool s_latch_by_caller; + + s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED; + latch_mode &= ulint(~BTR_ALREADY_S_LATCHED); + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL)); + + /* This function doesn't need to lock left page of the leaf page */ + if (latch_mode == BTR_SEARCH_PREV) { + latch_mode = BTR_SEARCH_LEAF; + } else if (latch_mode == BTR_MODIFY_PREV) { + latch_mode = BTR_MODIFY_LEAF; + } + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + upper_rw_latch = RW_NO_LATCH; + break; + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { + mtr_x_lock_index(index, mtr); + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + default: + ut_ad(!s_latch_by_caller + || mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_S_LOCK)); + if (!srv_read_only_mode) { + if (!s_latch_by_caller) { + /* BTR_SEARCH_TREE is intended to be used with + BTR_ALREADY_S_LATCHED */ + ut_ad(latch_mode != BTR_SEARCH_TREE); + + mtr_s_lock_index(index, mtr); + } + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + page_id_t page_id(index->table->space_id, index->page); + const ulint zip_size = index->table->space->zip_size(); + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + height = ULINT_UNDEFINED; + + for (;;) { + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + + const ulint rw_latch = height + && (latch_mode != BTR_MODIFY_TREE || height == level) + ? upper_rw_latch : RW_NO_LATCH; + buf_block_t* block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, BUF_GET, + file, line, mtr, &err, + height == 0 + && !index->is_clust()); + ut_ad((block != NULL) == (err == DB_SUCCESS)); + tree_blocks[n_blocks] = block; + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + goto exit_loop; + } + + const page_t* page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + continue; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + root_height = height; + ut_a(height >= level); + } else { + /* TODO: flag the index corrupted if this fails */ + ut_ad(height == btr_page_get_level(page)); + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + btr_cur_latch_leaves(block, latch_mode, + cursor, mtr); + } + + /* In versions <= 3.23.52 we had forgotten to + release the tree latch here. If in an index + scan we had to scan far to find a record + visible to the current transaction, that could + starve others waiting for the tree latch. */ + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + if (UNIV_UNLIKELY(srv_read_only_mode)) { + break; + } + if (!s_latch_by_caller) { + /* Release the tree s-latch */ + mtr_release_s_latch_at_savepoint( + mtr, savepoint, &index->lock); + } + + /* release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } else if (height == level /* height != 0 */ + && UNIV_LIKELY(!srv_read_only_mode)) { + /* We already have the block latched. */ + ut_ad(latch_mode == BTR_SEARCH_TREE); + ut_ad(s_latch_by_caller); + ut_ad(upper_rw_latch == RW_S_LATCH); + ut_ad(mtr->memo_contains_flagged(block, + MTR_MEMO_PAGE_S_FIX)); + + if (s_latch_by_caller) { + /* to exclude modifying tree operations + should sx-latch the index. */ + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_SX_LOCK)); + /* because has sx-latch of index, + can release upper blocks. */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + if (from_left) { + page_cur_set_before_first(block, page_cursor); + } else { + page_cur_set_after_last(block, page_cursor); + } + + if (height == level) { + if (estimate) { + btr_cur_add_path_info(cursor, height, + root_height); + } + + break; + } + + ut_ad(height > 0); + + if (from_left) { + page_cur_move_to_next(page_cursor); + } else { + page_cur_move_to_prev(page_cursor); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + 0, ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + + ut_ad(upper_rw_latch == RW_X_LATCH); + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(dict_index_get_page(index)); + + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + continue; + } + + if (latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + cursor->index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr)) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == level + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + } + + exit_loop: + if (heap) { + mem_heap_free(heap); + } + + return err; +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. +@return true if the index is available and we have put the cursor, false +if the index is unavailable */ +bool +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + ulint node_ptr_max_size = srv_page_size / 2; + ulint height; + rec_t* node_ptr; + btr_intention_t lock_intention; + buf_block_t* tree_blocks[BTR_MAX_LEVELS]; + ulint tree_savepoints[BTR_MAX_LEVELS]; + ulint n_blocks = 0; + ulint n_releases = 0; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!index->is_spatial()); + + lock_intention = btr_cur_get_and_clear_intention(&latch_mode); + + ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL)); + + ulint savepoint = mtr_set_savepoint(mtr); + + rw_lock_type_t upper_rw_latch; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + /* Most of delete-intended operations are purging. + Free blocks and read IO bandwidth should be prior + for them, when the history list is glowing huge. */ + if (lock_intention == BTR_INTENTION_DELETE + && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH + && buf_pool.n_pend_reads) { + mtr_x_lock_index(index, mtr); + } else { + mtr_sx_lock_index(index, mtr); + } + upper_rw_latch = RW_X_LATCH; + break; + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + /* This function doesn't support left uncle + page lock for left leaf page lock, when + needed. */ + case BTR_SEARCH_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + ut_ad(0); + /* fall through */ + default: + if (!srv_read_only_mode) { + mtr_s_lock_index(index, mtr); + upper_rw_latch = RW_S_LATCH; + } else { + upper_rw_latch = RW_NO_LATCH; + } + } + + DBUG_EXECUTE_IF("test_index_is_unavailable", + return(false);); + + if (index->page == FIL_NULL) { + /* Since we don't hold index lock until just now, the index + could be modified by others, for example, if this is a + statistics updater for referenced table, it could be marked + as unavailable by 'DROP TABLE' in the mean time, since + we don't hold lock for statistics updater */ + return(false); + } + + const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf( + latch_mode); + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + page_id_t page_id(index->table->space_id, index->page); + const ulint zip_size = index->table->space->zip_size(); + dberr_t err = DB_SUCCESS; + + if (root_leaf_rw_latch == RW_X_LATCH) { + node_ptr_max_size = btr_node_ptr_max_size(index); + } + + height = ULINT_UNDEFINED; + + for (;;) { + page_t* page; + + ut_ad(n_blocks < BTR_MAX_LEVELS); + tree_savepoints[n_blocks] = mtr_set_savepoint(mtr); + + const rw_lock_type_t rw_latch = height + && latch_mode != BTR_MODIFY_TREE + ? upper_rw_latch : RW_NO_LATCH; + buf_block_t* block = buf_page_get_gen(page_id, zip_size, + rw_latch, NULL, BUF_GET, + file, line, mtr, &err, + height == 0 + && !index->is_clust()); + tree_blocks[n_blocks] = block; + + ut_ad((block != NULL) == (err == DB_SUCCESS)); + + if (err != DB_SUCCESS) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + break; + } + + page = buf_block_get_frame(block); + + if (height == ULINT_UNDEFINED + && page_is_leaf(page) + && rw_latch != RW_NO_LATCH + && rw_latch != root_leaf_rw_latch) { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH); + + ut_ad(n_blocks == 0); + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_blocks], + tree_blocks[n_blocks]); + + upper_rw_latch = root_leaf_rw_latch; + continue; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page); + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH + || srv_read_only_mode) { + btr_cur_latch_leaves(block, latch_mode, cursor, + mtr); + } + + /* btr_cur_open_at_index_side_func() and + btr_cur_search_to_nth_level() release + tree s-latch here.*/ + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + case BTR_CONT_SEARCH_TREE: + break; + default: + /* Release the tree s-latch */ + if (!srv_read_only_mode) { + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + /* release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, + tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + } + + page_cur_open_on_rnd_user_rec(block, page_cursor); + + if (height == 0) { + + break; + } + + ut_ad(height > 0); + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + 0, ULINT_UNDEFINED, &heap); + + /* If the rec is the first or last in the page for + pessimistic delete intention, it might cause node_ptr insert + for the upper level. We should change the intention and retry. + */ + if (latch_mode == BTR_MODIFY_TREE + && btr_cur_need_opposite_intention( + page, lock_intention, node_ptr)) { + + ut_ad(upper_rw_latch == RW_X_LATCH); + /* release all blocks */ + for (; n_releases <= n_blocks; n_releases++) { + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + + lock_intention = BTR_INTENTION_BOTH; + + page_id.set_page_no(dict_index_get_page(index)); + + height = ULINT_UNDEFINED; + + n_blocks = 0; + n_releases = 0; + + continue; + } + + if (latch_mode == BTR_MODIFY_TREE + && !btr_cur_will_modify_tree( + cursor->index, page, lock_intention, node_ptr, + node_ptr_max_size, zip_size, mtr)) { + ut_ad(upper_rw_latch == RW_X_LATCH); + ut_ad(n_releases <= n_blocks); + + /* we can release upper blocks */ + for (; n_releases < n_blocks; n_releases++) { + if (n_releases == 0) { + /* we should not release root page + to pin to same block. */ + continue; + } + + /* release unused blocks to unpin */ + mtr_release_block_at_savepoint( + mtr, tree_savepoints[n_releases], + tree_blocks[n_releases]); + } + } + + if (height == 0 + && latch_mode == BTR_MODIFY_TREE) { + ut_ad(upper_rw_latch == RW_X_LATCH); + /* we should sx-latch root page, if released already. + It contains seg_header. */ + if (n_releases > 0) { + mtr_block_sx_latch_at_savepoint( + mtr, tree_savepoints[0], + tree_blocks[0]); + } + + /* x-latch the branch blocks not released yet. */ + for (ulint i = n_releases; i <= n_blocks; i++) { + mtr_block_x_latch_at_savepoint( + mtr, tree_savepoints[i], + tree_blocks[i]); + } + } + + /* Go to the child node */ + page_id.set_page_no( + btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + n_blocks++; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err == DB_SUCCESS; +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to inserted record if succeed, else NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + + /* If the record did not fit, reorganize. + For compressed pages, page_cur_tuple_insert() + attempted this already. */ + if (!rec && !page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, cursor->index, mtr)) { + rec = page_cur_tuple_insert( + page_cursor, tuple, cursor->index, + offsets, heap, n_ext, mtr); + } + + ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets)); + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6))) +dberr_t +btr_cur_ins_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert */ + dtuple_t* entry, /*!< in/out: entry to insert */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: true if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + dict_index_t* index; + dberr_t err = DB_SUCCESS; + rec_t* rec; + roll_ptr_t roll_ptr; + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->is_named_space(index->table->space)); + + /* Check if there is predicate or GAP lock preventing the insertion */ + if (!(flags & BTR_NO_LOCKING_FLAG)) { + const unsigned type = index->type; + if (UNIV_UNLIKELY(type & DICT_SPATIAL)) { + lock_prdt_t prdt; + rtr_mbr_t mbr; + + rtr_get_mbr_from_tuple(entry, &mbr); + + /* Use on stack MBR variable to test if a lock is + needed. If so, the predicate (MBR) will be allocated + from lock heap in lock_prdt_insert_check_and_lock() */ + lock_init_prdt_from_mbr( + &prdt, &mbr, 0, NULL); + + err = lock_prdt_insert_check_and_lock( + flags, rec, btr_cur_get_block(cursor), + index, thr, mtr, &prdt); + *inherit = false; + } else { +#ifdef WITH_WSREP + trx_t* trx= thr_get_trx(thr); + /* If transaction scanning an unique secondary + key is wsrep high priority thread (brute + force) this scanning may involve GAP-locking + in the index. As this locking happens also + when applying replication events in high + priority applier threads, there is a + probability for lock conflicts between two + wsrep high priority threads. To avoid this + GAP-locking we mark that this transaction + is using unique key scan here. */ + if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE + && trx->is_wsrep() + && wsrep_thd_is_BF(trx->mysql_thd, false)) { + trx->wsrep_UK_scan= true; + } +#endif /* WITH_WSREP */ + err = lock_rec_insert_check_and_lock( + flags, rec, btr_cur_get_block(cursor), + index, thr, mtr, inherit); +#ifdef WITH_WSREP + trx->wsrep_UK_scan= false; +#endif /* WITH_WSREP */ + } + } + + if (err != DB_SUCCESS + || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) + || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) { + + return(err); + } + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS; + if (!(flags & BTR_KEEP_SYS_FLAG)) { +upd_sys: + dfield_t* r = dtuple_get_nth_field( + entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), + roll_ptr); + } + } else { + err = trx_undo_report_row_operation(thr, index, entry, + NULL, 0, NULL, NULL, + &roll_ptr); + if (err == DB_SUCCESS) { + goto upd_sys; + } + } + + return(err); +} + +/** +Prefetch siblings of the leaf for the pessimistic operation. +@param block leaf page +@param index index of the page */ +static void btr_cur_prefetch_siblings(const buf_block_t *block, + const dict_index_t *index) +{ + ut_ad(page_is_leaf(block->frame)); + + if (index->is_ibuf()) + return; + + const page_t *page= block->frame; + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); + + if (prev == FIL_NULL); + else if (index->table->space->acquire()) + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), prev), + block->zip_size(), false); + if (next == FIL_NULL); + else if (index->table->space->acquire()) + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), next), + block->zip_size(), false); +} + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; + ulint rec_size; + dberr_t err; + + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + +#ifdef HAVE_valgrind + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); + } +#endif /* HAVE_valgrind */ + + leaf = page_is_leaf(page); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { + if (big_rec_vec != NULL) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), + goto fail); + + if (block->page.zip.data && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; + + /* prefetch siblings of the leaf for the pessimistic + operation, if the page is leaf. */ + if (page_is_leaf(page)) { + btr_cur_prefetch_siblings(block, index); + } +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(err); + } + + ulint max_size = page_get_max_insert_size_after_reorganize(page, 1); + if (max_size < rec_size) { + goto fail; + } + + const ulint n_recs = page_get_n_recs(page); + if (UNIV_UNLIKELY(n_recs >= 8189)) { + ut_ad(srv_page_size == 65536); + goto fail; + } + + if (page_has_garbage(page)) { + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + && n_recs > 1 + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + } + + /* If there have been many consecutive inserts to the + clustered index leaf page of an uncompressed table, check if + we have to split the page to reserve enough free space for + future updates of records. */ + + if (leaf && !block->page.zip.data && dict_index_is_clust(index) + && page_get_n_recs(page) >= 2 + && dict_index_get_space_reserve() + rec_size > max_size + && (btr_page_get_split_rec_to_right(cursor, &dummy) + || btr_page_get_split_rec_to_left(cursor))) { + goto fail; + } + + page_cursor = btr_cur_get_page_cur(cursor); + + DBUG_LOG("ib_cur", + "insert " << index->name << " (" << index->id << ") by " + << ib::hex(thr ? thr->graph->trx->id : 0) + << ' ' << rec_printer(entry).str()); + DBUG_EXECUTE_IF("do_page_reorganize", + btr_page_reorganize(page_cursor, index, mtr);); + + /* Now, try the insert */ + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + + /* Check locks and write to the undo log, + if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + if (err != DB_SUCCESS) { + goto fail_err; + } + +#ifdef UNIV_DEBUG + if (!(flags & BTR_CREATE_FLAG) + && index->is_primary() && page_is_leaf(page)) { + const dfield_t* trx_id = dtuple_get_nth_field( + entry, dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, + DATA_TRX_ID), + index)); + + ut_ad(trx_id->len == DATA_TRX_ID_LEN); + ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN); + ut_ad(*static_cast<const byte*> + (trx_id[1].data) & 0x80); + if (flags & BTR_NO_UNDO_LOG_FLAG) { + ut_ad(!memcmp(trx_id->data, reset_trx_id, + DATA_TRX_ID_LEN)); + } else { + ut_ad(thr->graph->trx->id); + ut_ad(thr->graph->trx->id + == trx_read_trx_id( + static_cast<const byte*>( + trx_id->data)) + || index->table->is_temporary()); + } + } +#endif + + *rec = page_cur_tuple_insert( + page_cursor, entry, index, offsets, heap, + n_ext, mtr); + + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + } + + if (*rec) { + } else if (block->page.zip.data) { + ut_ad(!index->table->is_temporary()); + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. */ + if (leaf + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + + /* If the record did not fit, reorganize */ + if (!btr_page_reorganize(page_cursor, index, mtr)) { + ut_ad(0); + goto fail; + } + + ut_ad(page_get_max_insert_size(page, 1) == max_size); + + reorg = TRUE; + + *rec = page_cur_tuple_insert(page_cursor, entry, index, + offsets, heap, n_ext, mtr); + + if (UNIV_UNLIKELY(!*rec)) { + ib::fatal() << "Cannot insert tuple " << *entry + << "into index " << index->name + << " of table " << index->table->name + << ". Max size: " << max_size; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!leaf) { +# ifdef MYSQL_INDEX_DISABLE_AHI + } else if (index->disable_ahi) { +# endif + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + } else { + rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf + && !dict_index_is_clust(index) + && !index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (block->page.zip.data) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. +@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + big_rec_t* big_rec_vec = NULL; + dberr_t err; + bool inherit = false; + bool success; + uint32_t n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + + *big_rec = NULL; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + /* First reserve enough free space for the file segments + of the index tree, so that the insert will not fail because + of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3); + + success = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, FSP_NORMAL, mtr); + if (!success) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + index->table->not_redundant(), + dtuple_get_n_fields(entry), + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. */ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (big_rec_vec == NULL) { + + index->table->space->release_free_extents(n_reserved); + return(DB_TOO_BIG_RECORD); + } + } + + if (dict_index_get_page(index) + == btr_cur_get_block(cursor)->page.id().page_no()) { + + /* The page is the root page */ + *rec = btr_root_raise_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } else { + *rec = btr_page_split_and_insert( + flags, cursor, offsets, heap, entry, n_ext, mtr); + } + + if (*rec == NULL && os_has_said_disk_full) { + return(DB_OUT_OF_FILE_SPACE); + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec + || dict_index_is_spatial(index)); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + ut_ad(!index->table->is_temporary()); + if (dict_index_is_spatial(index)) { + /* Do nothing */ + } else { + /* The cursor might be moved to the other page + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || !page_has_prev(btr_cur_get_page(cursor))) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = true; + } + } + } + + if (!page_is_leaf(btr_cur_get_page(cursor))) { + ut_ad(!big_rec_vec); + } else { +#ifdef BTR_CUR_HASH_ADAPT +# ifdef MYSQL_INDEX_DISABLE_AHI + if (index->disable_ahi); else +# endif + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else { + btr_search_update_hash_on_insert( + cursor, btr_search_sys.get_latch(*index)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + } + + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. +@return DB_SUCCESS, DB_WAIT_LOCK, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG)); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return((flags & BTR_NO_UNDO_LOG_FLAG) + ? DB_SUCCESS + : trx_undo_report_row_operation( + thr, index, NULL, update, + cmpl_info, rec, offsets, roll_ptr)); +} + +/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry. +@param[in,out] entry clustered index entry +@param[in] index clustered index +@param[in] trx_id DB_TRX_ID +@param[in] roll_ptr DB_ROLL_PTR */ +static void btr_cur_write_sys( + dtuple_t* entry, + const dict_index_t* index, + trx_id_t trx_id, + roll_ptr_t roll_ptr) +{ + dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id()); + ut_ad(t->len == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(t->data), trx_id); + dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr()); + ut_ad(r->len == DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr); +} + +/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record. +@param[in,out] block clustered index leaf page +@param[in,out] rec clustered index record +@param[in] index clustered index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx transaction +@param[in] roll_ptr DB_ROLL_PTR value +@param[in,out] mtr mini-transaction */ +static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec, + dict_index_t *index, const rec_offs *offsets, + const trx_t *trx, roll_ptr_t roll_ptr, + mtr_t *mtr) +{ + ut_ad(index->is_primary()); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(), + trx->id, roll_ptr, mtr); + return; + } + + ulint offset= index->trx_id_offset; + + if (!offset) + offset= row_get_trx_id_offset(index, offsets); + + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + + /* During IMPORT the trx id in the record can be in the future, if + the .ibd file is being imported from another instance. During IMPORT + roll_ptr will be 0. */ + ut_ad(roll_ptr == 0 || + lock_check_trx_id_sanity(trx_read_trx_id(rec + offset), + rec, index, offsets)); + + byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + + trx_write_trx_id(sys, trx->id); + trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr); + + ulint d= 0; + const byte *src= nullptr; + byte *dest= rec + offset; + ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + if (UNIV_LIKELY(index->trx_id_offset)) + { + const rec_t *prev= page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(prev == rec)) + ut_ad(0); + else if (page_rec_is_infimum(prev)); + else + for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++) + if (src[d] != sys[d]) + break; + if (d > 6 && memcmp(dest, sys, d)) + { + /* We save space by replacing a single record + + WRITE,page_offset(dest),byte[13] + + with two records: + + MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes), + WRITE|0x80,0,byte[13-d] + + The single WRITE record would be x+13 bytes long, with x>2. + The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the + second WRITE would be 1+1+13-d = 15-d bytes. + + The total size is: x+13 versus x+4+15-d = x+19-d bytes. + To save space, we must have d>6, that is, the complete DB_TRX_ID and + the first byte(s) of DB_ROLL_PTR must match the previous record. */ + memcpy(dest, src, d); + mtr->memmove(*block, page_offset(dest), page_offset(src), d); + dest+= d; + len-= d; + /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when + DB_TRX_ID refers to an active transaction. */ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len); +} + +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ + dict_index_t* index, /*!< in: the index corresponding to cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + + /* Have a local copy of the variables as these can change + dynamically. */ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (!btr_page_reorganize(cursor, index, mtr)) { + goto out_of_space; + } + + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + +out_of_space: + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). +@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(!block->page.zip.data || index->table->not_redundant()); + +#ifdef UNIV_DEBUG + if (rec_offs_comp(offsets)) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong record status in update" == 0); + } + } +#endif /* UNIV_DEBUG */ + + static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility"); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(rec_offs_comp(offsets)); + byte* info_bits = &rec[-REC_NEW_INFO_BITS]; + const bool flip_del_mark = (*info_bits ^ update->info_bits) + & REC_INFO_DELETED_FLAG; + *info_bits &= byte(~REC_INFO_BITS_MASK); + *info_bits |= update->info_bits; + + if (flip_del_mark) { + page_zip_rec_set_deleted(block, rec, update->info_bits + & REC_INFO_DELETED_FLAG, mtr); + } + } else { + byte* info_bits = &rec[rec_offs_comp(offsets) + ? -REC_NEW_INFO_BITS + : -REC_OLD_INFO_BITS]; + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, + (*info_bits + & ~REC_INFO_BITS_MASK) + | update->info_bits); + } + + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) { + continue; + } + const ulint n = uf->field_no; + + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + + ut_ad(!index->table->not_redundant()); + switch (ulint size = rec_get_nth_field_size(rec, n)) { + case 0: + break; + case 1: + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, + rec_get_field_start_offs(rec, n) + rec, + 0U); + break; + default: + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b | REC_1BYTE_SQL_NULL_MASK)); + continue; + } + + ulint len; + byte* data = rec_get_nth_field(rec, offsets, n, &len); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(len == uf->new_val.len); + memcpy(data, uf->new_val.data, len); + continue; + } + + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + len = uf->new_val.len; + ut_ad(len == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); + } + + if (len) { + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data, + uf->new_val.data, len); + } + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + page_zip_write_rec(block, rec, index, offsets, 0, mtr); + } +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. +@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + + ut_ad(page_is_leaf(cursor->page_cur.block->frame)); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); + + DBUG_LOG("ib_cur", + "update-in-place " << index->name << " (" << index->id + << ") by " << ib::hex(trx_id) << ": " + << rec_printer(rec, offsets).str()); + + buf_block_t* block = btr_cur_get_block(cursor); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. */ + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(!index->table->is_temporary()); + + if (!btr_cur_update_alloc_zip( + page_zip, btr_cur_get_page_cur(cursor), + index, offsets, rec_offs_size(offsets), + false, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = btr_cur_get_rec(cursor); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_upd_rec_sys(block, rec, index, offsets, + thr_get_trx(thr), roll_ptr, mtr); + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + /* In delete-marked records, DB_TRX_ID must always refer to an + existing undo log record. */ + ut_ad(!was_delete_marked + || !dict_index_is_clust(index) + || row_get_rec_trx_id(rec, index, offsets)); + +#ifdef BTR_CUR_HASH_ADAPT + { + rw_lock_t* ahi_latch = block->index + ? btr_search_sys.get_latch(*index) : NULL; + if (ahi_latch) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? */ + + /* The function row_upd_changes_ord_field_binary + does not work on a secondary index. */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary( + index, update, thr, NULL, NULL)) { + ut_ad(!(update->info_bits + & REC_INFO_MIN_REC_FLAG)); + /* Remove possible hash index pointer + to this record */ + btr_search_update_hash_on_delete(cursor); + } + + rw_lock_x_lock(ahi_latch); + } + + assert_block_ahi_valid(block); +#endif /* BTR_CUR_HASH_ADAPT */ + + btr_cur_upd_rec_in_place(rec, index, offsets, update, block, + mtr); + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi_latch) { + rw_lock_x_unlock(ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (was_delete_marked + && !rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr); + } + + ut_ad(err == DB_SUCCESS); + +func_exit: + if (page_zip + && !(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + return(err); +} + +/** Trim a metadata record during the rollback of instant ALTER TABLE. +@param[in] entry metadata tuple +@param[in] index primary key +@param[in] update update vector for the rollback */ +ATTRIBUTE_COLD +static void btr_cur_trim_alter_metadata(dtuple_t* entry, + const dict_index_t* index, + const upd_t* update) +{ + ut_ad(index->is_instant()); + ut_ad(update->is_alter_metadata()); + ut_ad(entry->is_alter_metadata()); + + ut_ad(update->fields[0].field_no == index->first_user_field()); + ut_ad(update->fields[0].new_val.ext); + ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE); + ut_ad(entry->n_fields - 1 == index->n_fields); + + const byte* ptr = static_cast<const byte*>( + update->fields[0].new_val.data); + ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN)); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA); + ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + == index->table->space->id); + + ulint n_fields = update->fields[1].field_no; + ut_ad(n_fields <= index->n_fields); + if (n_fields != index->n_uniq) { + ut_ad(n_fields + >= index->n_core_fields); + entry->n_fields = n_fields; + return; + } + + /* This is based on dict_table_t::deserialise_columns() + and btr_cur_instant_init_low(). */ + mtr_t mtr; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id_t(index->table->space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + == FIL_NULL); + ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4)); + n_fields = mach_read_from_4( + &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE]) + + index->first_user_field(); + /* Rollback should not increase the number of fields. */ + ut_ad(n_fields <= index->n_fields); + ut_ad(n_fields + 1 <= entry->n_fields); + /* dict_index_t::clear_instant_alter() cannot be invoked while + rollback of an instant ALTER TABLE transaction is in progress + for an is_alter_metadata() record. */ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { + /* We are either updating a metadata record + (instant ALTER TABLE on a table where instant ALTER was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(entry->is_metadata()); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. + The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. +@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + ulint max_ins_size = 0; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page updates */ + ut_ad(page_is_leaf(page)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (UNIV_LIKELY(!update->is_metadata()) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + ut_ad(!index->is_ibuf()); + /* Externally stored fields are treated in pessimistic + update */ + + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + + return(DB_OVERFLOW); + } + + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + DBUG_LOG("ib_cur", + "update " << index->name << " (" << index->id << ") by " + << ib::hex(trx_id) << ": " + << rec_printer(rec, *offsets).str()); + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap); + ut_ad(!dtuple_get_n_ext(new_entry)); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + *heap); + btr_cur_trim(new_entry, index, update, thr); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + + if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size())) { + goto any_extern; + } + + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, index, *offsets, + new_rec_size, true, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + /* We limit max record size to 16k even for 64k page size. */ + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; + + goto func_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize( + page, 1); + } + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_store_on_page_infimum(block, rec); + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + /* There are no externally stored columns in new_entry */ + rec = btr_cur_insert_if_possible( + cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ + + if (UNIV_UNLIKELY(update->is_metadata())) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + } else if (!dict_table_is_locking_disabled(index->table)) { + /* Restore the old explicit lock state on the record */ + lock_rec_restore_from_page_infimum(block, rec, block); + } + + page_cur_move_to_next(page_cursor); + ut_ad(err == DB_SUCCESS); + +func_exit: + if (!(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index)) { + /* Update the free bits in the insert buffer. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + } + + if (err != DB_SUCCESS) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. */ +static +void +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + buf_block_t* prev_block; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + + return; + } + + const uint32_t prev_page_no = btr_page_get_prev(page); + + const page_id_t page_id(block->page.id().space(), prev_page_no); + + ut_ad(prev_page_no != FIL_NULL); + prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(), + mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_block->frame) + == block->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(prev_block, block, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be + committed before latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dberr_t err; + dberr_t optim_err; + roll_ptr_t roll_ptr; + bool was_first; + uint32_t n_reserved = 0; + + *offsets = NULL; + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + index = cursor->index; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(!page_zip || !index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~BTR_KEEP_POS_FLAG)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + + err = optim_err = btr_cur_optimistic_update( + flags | BTR_KEEP_IBUF_BITMAP, + cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); + + switch (err) { + case DB_ZIP_OVERFLOW: + case DB_UNDERFLOW: + case DB_OVERFLOW: + break; + default: + err_exit: + /* We suppressed this with BTR_KEEP_IBUF_BITMAP. + For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were + already reset by btr_cur_update_alloc_zip() if the + page was recompressed. */ + if (page_zip + && optim_err != DB_ZIP_OVERFLOW + && !dict_index_is_clust(index) + && page_is_leaf(block->frame)) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + if (big_rec_vec != NULL) { + dtuple_big_rec_free(big_rec_vec); + } + + return(err); + } + + rec = btr_cur_get_rec(cursor); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + if ((flags & BTR_NO_UNDO_LOG_FLAG) + && rec_offs_any_extern(*offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->in_rollback); + + DEBUG_SYNC_C("blob_rollback_middle"); + + btr_rec_free_updated_extern_fields( + index, rec, block, *offsets, update, true, mtr); + } + + ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0; + + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(block->frame), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { + big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + index->table->space->release_free_extents(n_reserved); + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(block->frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (optim_err == DB_OVERFLOW) { + + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3); + + if (!fsp_reserve_free_extents( + &n_reserved, index->table->space, n_extents, + flags & BTR_NO_UNDO_LOG_FLAG + ? FSP_CLEANING : FSP_NORMAL, + mtr)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + const ulint max_ins_size = page_zip + ? 0 : page_get_max_insert_size_after_reorganize(block->frame, + 1); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ALTER TABLE, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_store_on_page_infimum(block, rec); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, index, *offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + if (page_cursor->block->page.id().page_no() + == index->page) { + btr_set_instant(page_cursor->block, *index, + mtr); + } + } else if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_restore_from_page_infimum( + btr_cur_get_block(cursor), rec, block); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), + rec, index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(block->frame)); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); + } + } else if (!dict_index_is_clust(index) + && page_is_leaf(block->frame)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } + } + + if (!srv_read_only_mode + && !big_rec_vec + && page_is_leaf(block->frame) + && !dict_index_is_online_ddl(index)) { + + mtr_memo_release(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(block->frame)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec != NULL) { + ut_ad(page_is_leaf(block->frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + mtr_sx_lock_index(index, mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(rec); + ut_a(err == DB_SUCCESS); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + page_cursor->rec = rec; + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + page_update_max_trx_id(btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, block->frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec, + index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + btr_page_reorganize(page_cursor, index, mtr); + rec = page_cursor->rec; + } else if (!dict_table_is_locking_disabled(index->table)) { + lock_rec_restore_from_page_infimum( + btr_cur_get_block(cursor), rec, block); + } + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. */ + + if (!was_first && !dict_table_is_locking_disabled(index->table)) { + btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor), + rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor), + btr_cur_get_page(cursor), index)); +#endif /* UNIV_ZIP_DEBUG */ + + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/** Modify the delete-mark flag of a record. +@tparam flag the value of the delete-mark flag +@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in,out] mtr mini-transaction */ +template<bool flag> +void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr) +{ + if (page_rec_is_comp(rec)) + { + byte *b= &rec[-REC_NEW_INFO_BITS]; + const byte v= flag + ? (*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + if (*b == v); + else if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + *b= v; + page_zip_rec_set_deleted(block, rec, flag, mtr); + } + else + mtr->write<1>(*block, b, v); + } + else + { + ut_ad(!block->page.zip.data); + byte *b= &rec[-REC_OLD_INFO_BITS]; + const byte v = flag + ? (*b | REC_INFO_DELETED_FLAG) + : (*b & byte(~REC_INFO_DELETED_FLAG)); + mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v); + } +} + +template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *); +template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *); + +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record, also + contains the virtual cols if there are any */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + roll_ptr_t roll_ptr; + dberr_t err; + trx_t* trx; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(buf_block_get_frame(block) == page_align(rec)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* We may already have delete-marked this record + when executing an ON DELETE CASCADE operation. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets) + == thr_get_trx(thr)->id); + return(DB_SUCCESS); + } + + err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block, + rec, index, offsets, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + err = trx_undo_report_row_operation(thr, index, + entry, NULL, 0, rec, offsets, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* The search latch is not needed here, because + the adaptive hash index does not depend on the delete-mark + and the delete-mark is being updated in place. */ + + btr_rec_set_deleted<true>(block, rec, mtr); + + trx = thr_get_trx(thr); + + DBUG_LOG("ib_cur", + "delete-mark clust " << index->table->name + << " (" << index->id << ") by " + << ib::hex(trx_get_id_for_print(trx)) << ": " + << rec_printer(rec, offsets).str()); + + if (dict_index_is_online_ddl(index)) { + row_log_table_delete(rec, index, offsets, NULL); + } + + btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr); + return(err); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return TRUE if compression occurred */ +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + if (cursor->index->is_spatial()) { + const trx_t* trx = cursor->rtr_info->thr + ? thr_get_trx(cursor->rtr_info->thr) + : NULL; + const buf_block_t* block = btr_cur_get_block(cursor); + + /* Check whether page lock prevents the compression */ + if (!lock_test_prdt_page_lock(trx, block->page.id())) { + return(false); + } + } + + return(btr_cur_compress_recommendation(cursor, mtr) + && btr_compress(cursor, adjust, mtr)); +} + +/*******************************************************//** +Removes the record on which the tree cursor is positioned on a leaf page. +It is assumed that the mtr has an x-latch on the page where the cursor is +positioned, but no latch on the whole tree. +@return TRUE if success, i.e., the page did not become too empty */ +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ +#ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(cursor->index->table->space)); + ut_ad(!cursor->index->is_dummy); + + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.id().space() == cursor->index->table->space->id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index) + || dict_index_is_clust(cursor->index) + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, + ULINT_UNDEFINED, &heap); + + const ibool no_compress_needed = !rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr); + + if (!no_compress_needed) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, cursor->index); + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page + && page_get_n_recs(block->frame) == 1 + + (cursor->index->is_instant() + && !rec_is_metadata(rec, *cursor->index)) + && !cursor->index->must_avoid_clear_instant_add())) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN (not generic ALTER TABLE). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index; + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->frame)); + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! */ + index->clear_instant_alter(); + } + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + goto func_exit; + } + } + + { + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index->table->supports_instant()); + ut_ad(cursor->index->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. */ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + cursor->index, mtr); + goto func_exit; + } else { + lock_update_delete(block, rec); + + btr_search_update_hash_on_delete(cursor); + } + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, cursor->index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. */ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (!dict_index_is_clust(cursor->index) + && !cursor->index->table->is_temporary() + && !dict_index_is_ibuf(cursor->index)) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(no_compress_needed); +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred and FALSE if not or something +wrong. */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + uint32_t n_reserved = 0; + bool success; + ibool ret = FALSE; + mem_heap_t* heap; + rec_offs* offsets; +#ifdef UNIV_DEBUG + bool parent_latched = false; +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1); + + success = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, + FSP_CLEANING, mtr); + if (!success) { + *err = DB_OUT_OF_FILE_SPACE; + + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, block, + rollback, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + rec_t* next_rec = NULL; + bool min_mark_next_rec = false; + + if (page_is_leaf(page)) { + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ALTER TABLE. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { + lock_update_delete(block, rec); + } + + if (block->page.id().page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() && !is_metadata) + && !index->must_avoid_clear_instant_add()) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the + table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! */ + index->clear_instant_alter(); + } + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; + } + } + + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + index, offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + btr_page_reorganize(btr_cur_get_page_cur(cursor), + index, mtr); + ut_ad(!ret); + goto return_after_reservations; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + goto return_after_reservations; + } + + next_rec = page_rec_get_next(rec); + + if (!page_has_prev(page)) { + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + min_mark_next_rec = true; + } else if (index->is_spatial()) { + /* For rtree, if delete the leftmost node pointer, + we need to update parent page. */ + rtr_mbr_t father_mbr; + rec_t* father_rec; + btr_cur_t father_cursor; + rec_offs* offsets; + bool upd_ret; + ulint len; + + rtr_page_get_father_block(NULL, heap, index, + block, mtr, NULL, + &father_cursor); + offsets = rec_get_offsets( + btr_cur_get_rec(&father_cursor), index, NULL, + 0, ULINT_UNDEFINED, &heap); + + father_rec = btr_cur_get_rec(&father_cursor); + rtr_read_mbr(rec_get_nth_field( + father_rec, offsets, 0, &len), &father_mbr); + + upd_ret = rtr_update_mbr_field(&father_cursor, offsets, + NULL, page, &father_mbr, + next_rec, mtr); + + if (!upd_ret) { + *err = DB_ERROR; + + mem_heap_free(heap); + return(FALSE); + } + + ut_d(parent_latched = true); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the parent node pointer + so that it is equal to the new leftmost node pointer + on the page */ + btr_cur_t cursor; + btr_page_get_father(index, block, mtr, &cursor); + btr_cur_node_ptr_delete(&cursor, mtr); + const ulint level = btr_page_get_level(page); + // FIXME: reuse the node_ptr from above + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, block->page.id().page_no(), + heap, level); + + btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + + ut_d(parent_latched = true); + } + } + + /* SPATIAL INDEX never use SX locks; we can allow page merges + while holding X lock on the spatial index tree. + Do not allow merges of non-leaf B-tree pages unless it is + safe to do so. */ + { + const bool allow_merge = page_is_leaf(page) + || dict_index_is_spatial(index) + || btr_cur_will_modify_tree( + index, page, BTR_INTENTION_DELETE, rec, + btr_node_ptr_max_size(index), + block->zip_size(), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, + offsets, mtr); + + if (min_mark_next_rec) { + btr_set_min_rec_mark(next_rec, *block, mtr); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(!parent_latched + || btr_check_node_ptr(index, block, mtr)); + + if (!ret && btr_cur_compress_recommendation(cursor, mtr)) { + if (UNIV_LIKELY(allow_merge)) { + ret = btr_cur_compress_if_useful( + cursor, FALSE, mtr); + } else { + ib::warn() << "Not merging page " + << block->page.id() + << " in index " << index->name + << " of " << index->table->name; + ut_ad("MDEV-14637" == 0); + } + } + } + +return_after_reservations: + *err = DB_SUCCESS; + + mem_heap_free(heap); + + if (!srv_read_only_mode + && page_is_leaf(page) + && !dict_index_is_online_ddl(index)) { + + mtr_memo_release(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK); + + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } + + index->table->space->release_free_extents(n_reserved); + return(ret); +} + +/** Delete the node pointer in a parent page. +@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent), + MTR_MEMO_PAGE_X_FIX)); + dberr_t err; + ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent, + BTR_CREATE_FLAG, false, + mtr); + ut_a(err == DB_SUCCESS); + if (!compressed) { + btr_cur_compress_if_useful(parent, FALSE, mtr); + } +} + +/*******************************************************************//** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor positioned on a page */ + ulint height, /*!< in: height of the page in tree; + 0 means leaf node */ + ulint root_height) /*!< in: root node height in tree */ +{ + btr_path_t* slot; + + ut_a(cursor->path_arr); + + if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) { + /* Do nothing; return empty path */ + + slot = cursor->path_arr; + slot->nth_rec = ULINT_UNDEFINED; + + return; + } + + if (height == 0) { + /* Mark end of slots for path */ + slot = cursor->path_arr + root_height + 1; + slot->nth_rec = ULINT_UNDEFINED; + } + + slot = cursor->path_arr + (root_height - height); + + const buf_block_t* block = btr_cur_get_block(cursor); + + slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + slot->n_recs = page_get_n_recs(block->frame); + slot->page_no = block->page.id().page_no(); + slot->page_level = btr_page_get_level(block->frame); +} + +/*******************************************************************//** +Estimate the number of rows between slot1 and slot2 for any level on a +B-tree. This function starts from slot1->page and reads a few pages to +the right, counting their records. If we reach slot2->page quickly then +we know exactly how many records there are between slot1 and slot2 and +we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly +then we calculate the average number of records in the pages scanned +so far and assume that all pages that we did not scan up to slot2->page +contain the same number of records, then we multiply that average to +the number of pages between slot1->page and slot2->page (which is +n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE. +@return number of rows, not including the borders (exact or estimated) */ +static +ha_rows +btr_estimate_n_rows_in_range_on_level( +/*==================================*/ + dict_index_t* index, /*!< in: index */ + btr_path_t* slot1, /*!< in: left border */ + btr_path_t* slot2, /*!< in: right border */ + ha_rows n_rows_on_prev_level, /*!< in: number of rows + on the previous level for the + same descend paths; used to + determine the number of pages + on this level */ + bool* is_n_rows_exact) /*!< out: TRUE if the returned + value is exact i.e. not an + estimation */ +{ + ha_rows n_rows = 0; + uint n_pages_read = 0; + ulint level; + + /* Assume by default that we will scan all pages between + slot1->page_no and slot2->page_no. */ + *is_n_rows_exact = true; + + /* Add records from slot1->page_no which are to the right of + the record which serves as a left border of the range, if any + (we don't include the record itself in this count). */ + if (slot1->nth_rec <= slot1->n_recs) { + n_rows += slot1->n_recs - slot1->nth_rec; + } + + /* Add records from slot2->page_no which are to the left of + the record which servers as a right border of the range, if any + (we don't include the record itself in this count). */ + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + + /* Count the records in the pages between slot1->page_no and + slot2->page_no (non inclusive), if any. */ + + /* Do not read more than this number of pages in order not to hurt + performance with this code which is just an estimation. If we read + this many pages before reaching slot2->page_no then we estimate the + average from the pages scanned so far. */ +# define N_PAGES_READ_LIMIT 10 + + const fil_space_t* space = index->table->space; + page_id_t page_id(space->id, slot1->page_no); + const ulint zip_size = space->zip_size(); + + level = slot1->page_level; + + do { + mtr_t mtr; + page_t* page; + buf_block_t* block; + dberr_t err=DB_SUCCESS; + + mtr_start(&mtr); + + /* Fetch the page. Because we are not holding the + index->lock, the tree may have changed and we may be + attempting to read a page that is no longer part of + the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to + silence a debug assertion about this. */ + block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH, + NULL, BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, &mtr, &err); + + ut_ad((block != NULL) == (err == DB_SUCCESS)); + + if (!block) { + if (err == DB_DECRYPTION_FAILED) { + ib_push_warning((void *)NULL, + DB_DECRYPTION_FAILED, + "Table %s is encrypted but encryption service or" + " used key_id is not available. " + " Can't continue reading table.", + index->table->name.m_name); + index->table->file_unreadable = true; + } + + mtr_commit(&mtr); + goto inexact; + } + + page = buf_block_get_frame(block); + + /* It is possible that the tree has been reorganized in the + meantime and this is a different page. If this happens the + calculated estimate will be bogus, which is not fatal as + this is only an estimate. We are sure that a page with + page_no exists because InnoDB never frees pages, only + reuses them. */ + if (!fil_page_index_page_check(page) + || btr_page_get_index_id(page) != index->id + || btr_page_get_level(page) != level) { + + /* The page got reused for something else */ + mtr_commit(&mtr); + goto inexact; + } + + /* It is possible but highly unlikely that the page was + originally written by an old version of InnoDB that did + not initialize FIL_PAGE_TYPE on other than B-tree pages. + For example, this could be an almost-empty BLOB page + that happens to contain the magic values in the fields + that we checked above. */ + + n_pages_read++; + + if (page_id.page_no() != slot1->page_no) { + /* Do not count the records on slot1->page_no, + we already counted them before this loop. */ + n_rows += page_get_n_recs(page); + } + + page_id.set_page_no(btr_page_get_next(page)); + + mtr_commit(&mtr); + + if (n_pages_read == N_PAGES_READ_LIMIT + || page_id.page_no() == FIL_NULL) { + /* Either we read too many pages or + we reached the end of the level without passing + through slot2->page_no, the tree must have changed + in the meantime */ + goto inexact; + } + + } while (page_id.page_no() != slot2->page_no); + + return(n_rows); + +inexact: + + *is_n_rows_exact = false; + + /* We did interrupt before reaching slot2->page */ + + if (n_pages_read > 0) { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows = n_rows_on_prev_level * n_rows / n_pages_read; + } else { + /* The tree changed before we could even + start with slot1->page_no */ + n_rows = 10; + } + + return(n_rows); +} + +/** If the tree gets changed too much between the two dives for the left +and right boundary then btr_estimate_n_rows_in_range_low() will retry +that many times before giving up and returning the value stored in +rows_in_range_arbitrary_ret_val. */ +static const unsigned rows_in_range_max_retries = 4; + +/** We pretend that a range has that many records if the tree keeps changing +for rows_in_range_max_retries retries while we try to estimate the records +in a given range. */ +static const ha_rows rows_in_range_arbitrary_ret_val = 10; + +/** Estimates the number of rows in a given index range. +@param[in] index index +@param[in] tuple1 range start +@param[in] tuple2 range end +@param[in] nth_attempt if the tree gets modified too much while +we are trying to analyze it, then we will retry (this function will call +itself, incrementing this parameter) +@return estimated number of rows; if after rows_in_range_max_retries +retries the tree keeps changing, then we will just return +rows_in_range_arbitrary_ret_val as a result (if +nth_attempt >= rows_in_range_max_retries and the tree is modified between +the two dives). */ +static +ha_rows +btr_estimate_n_rows_in_range_low( + dict_index_t* index, + btr_pos_t* tuple1, + btr_pos_t* tuple2, + unsigned nth_attempt) +{ + btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS]; + btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS]; + btr_cur_t cursor; + btr_path_t* slot1; + btr_path_t* slot2; + bool diverged; + bool diverged_lot; + ulint divergence_level; + ha_rows n_rows; + bool is_n_rows_exact; + ulint i; + mtr_t mtr; + ha_rows table_n_rows; + page_cur_mode_t mode2= tuple2->mode; + + table_n_rows = dict_table_get_n_rows(index->table); + + /* Below we dive to the two records specified by tuple1 and tuple2 and + we remember the entire dive paths from the tree root. The place where + the tuple1 path ends on the leaf level we call "left border" of our + interval and the place where the tuple2 path ends on the leaf level - + "right border". We take care to either include or exclude the interval + boundaries depending on whether <, <=, > or >= was specified. For + example if "5 < x AND x <= 10" then we should not include the left + boundary, but should include the right one. */ + + mtr_start(&mtr); + + cursor.path_arr = path1; + + bool should_count_the_left_border; + + if (dtuple_get_n_fields(tuple1->tuple) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple1->tuple, + tuple1->mode, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + + ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor))); + + /* We should count the border if there are any records to + match the criteria, i.e. if the maximum record on the tree is + 5 and x > 3 is specified then the cursor will be positioned at + 5 and we should count the border, but if x > 7 is specified, + then the cursor will be positioned at 'sup' on the rightmost + leaf page in the tree and we should not count the border. */ + should_count_the_left_border + = !page_rec_is_supremum(btr_cur_get_rec(&cursor)); + } else { + dberr_t err = DB_SUCCESS; + + err = btr_cur_open_at_index_side(true, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_estimate_n_rows_in_range_low " + << " called from file: " + << __FILE__ << " line: " << __LINE__ + << " table: " << index->table->name + << " index: " << index->name; + } + + ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor))); + + /* The range specified is wihout a left border, just + 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side() + positioned the cursor on the infimum record on the leftmost + page, which must not be counted. */ + should_count_the_left_border = false; + } + + tuple1->page_id= cursor.page_cur.block->page.id(); + + mtr_commit(&mtr); + + if (!index->is_readable()) { + return 0; + } + + mtr_start(&mtr); + + cursor.path_arr = path2; + + bool should_count_the_right_border; + + if (dtuple_get_n_fields(tuple2->tuple) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple2->tuple, + mode2, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, + __FILE__, __LINE__, &mtr); + + const rec_t* rec = btr_cur_get_rec(&cursor); + + ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec))); + + should_count_the_right_border + = (mode2 == PAGE_CUR_LE /* if the range is '<=' */ + /* and the record was found */ + && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple)) + || (mode2 == PAGE_CUR_L /* or if the range is '<' */ + /* and there are any records to match the criteria, + i.e. if the minimum record on the tree is 5 and + x < 7 is specified then the cursor will be + positioned at 5 and we should count the border, but + if x < 2 is specified, then the cursor will be + positioned at 'inf' and we should not count the + border */ + && !page_rec_is_infimum(rec)); + /* Notice that for "WHERE col <= 'foo'" MySQL passes to + ha_innobase::records_in_range(): + min_key=NULL (left-unbounded) which is expected + max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is + unexpected - one would expect + flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the + cursor will be positioned on the first record to the right of + the requested one (can also be positioned on the 'sup') and + we should not count the right border. */ + } else { + dberr_t err = DB_SUCCESS; + + err = btr_cur_open_at_index_side(false, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_estimate_n_rows_in_range_low " + << " called from file: " + << __FILE__ << " line: " << __LINE__ + << " table: " << index->table->name + << " index: " << index->name; + } + + ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor))); + + /* The range specified is wihout a right border, just + 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side() + positioned the cursor on the supremum record on the rightmost + page, which must not be counted. */ + should_count_the_right_border = false; + } + + tuple2->page_id= cursor.page_cur.block->page.id(); + + mtr_commit(&mtr); + + /* We have the path information for the range in path1 and path2 */ + + n_rows = 0; + is_n_rows_exact = true; + + /* This becomes true when the two paths do not pass through the + same pages anymore. */ + diverged = false; + + /* This becomes true when the paths are not the same or adjacent + any more. This means that they pass through the same or + neighboring-on-the-same-level pages only. */ + diverged_lot = false; + + /* This is the level where paths diverged a lot. */ + divergence_level = 1000000; + + for (i = 0; ; i++) { + ut_ad(i < BTR_PATH_ARRAY_N_SLOTS); + + slot1 = path1 + i; + slot2 = path2 + i; + + if (slot1->nth_rec == ULINT_UNDEFINED + || slot2->nth_rec == ULINT_UNDEFINED) { + + /* Here none of the borders were counted. For example, + if on the leaf level we descended to: + (inf, a, b, c, d, e, f, sup) + ^ ^ + path1 path2 + then n_rows will be 2 (c and d). */ + + if (is_n_rows_exact) { + /* Only fiddle to adjust this off-by-one + if the number is exact, otherwise we do + much grosser adjustments below. */ + + btr_path_t* last1 = &path1[i - 1]; + btr_path_t* last2 = &path2[i - 1]; + + /* If both paths end up on the same record on + the leaf level. */ + if (last1->page_no == last2->page_no + && last1->nth_rec == last2->nth_rec) { + + /* n_rows can be > 0 here if the paths + were first different and then converged + to the same record on the leaf level. + For example: + SELECT ... LIKE 'wait/synch/rwlock%' + mode1=PAGE_CUR_GE, + tuple1="wait/synch/rwlock" + path1[0]={nth_rec=58, n_recs=58, + page_no=3, page_level=1} + path1[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} + + mode2=PAGE_CUR_G + tuple2="wait/synch/rwlock" + path2[0]={nth_rec=57, n_recs=57, + page_no=3, page_level=1} + path2[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} */ + + /* If the range is such that we should + count both borders, then avoid + counting that record twice - once as a + left border and once as a right + border. */ + if (should_count_the_left_border + && should_count_the_right_border) { + + n_rows = 1; + } else { + /* Some of the borders should + not be counted, e.g. [3,3). */ + n_rows = 0; + } + } else { + if (should_count_the_left_border) { + n_rows++; + } + + if (should_count_the_right_border) { + n_rows++; + } + } + } + + if (i > divergence_level + 1 && !is_n_rows_exact) { + /* In trees whose height is > 1 our algorithm + tends to underestimate: multiply the estimate + by 2: */ + + n_rows = n_rows * 2; + } + + DBUG_EXECUTE_IF("bug14007649", return(n_rows);); + + /* Do not estimate the number of rows in the range + to over 1 / 2 of the estimated rows in the whole + table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) { + + n_rows = table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, + then we estimate all rows are in the range */ + + if (n_rows == 0) { + n_rows = table_n_rows; + } + } + + return(n_rows); + } + + if (!diverged && slot1->nth_rec != slot2->nth_rec) { + + /* If both slots do not point to the same page, + this means that the tree must have changed between + the dive for slot1 and the dive for slot2 at the + beginning of this function. */ + if (slot1->page_no != slot2->page_no + || slot1->page_level != slot2->page_level) { + + /* If the tree keeps changing even after a + few attempts, then just return some arbitrary + number. */ + if (nth_attempt >= rows_in_range_max_retries) { + return(rows_in_range_arbitrary_ret_val); + } + + return btr_estimate_n_rows_in_range_low( + index, tuple1, tuple2, + nth_attempt + 1); + } + + diverged = true; + + if (slot1->nth_rec < slot2->nth_rec) { + /* We do not count the borders (nor the left + nor the right one), thus "- 1". */ + n_rows = slot2->nth_rec - slot1->nth_rec - 1; + + if (n_rows > 0) { + /* There is at least one row between + the two borders pointed to by slot1 + and slot2, so on the level below the + slots will point to non-adjacent + pages. */ + diverged_lot = true; + divergence_level = i; + } + } else { + /* It is possible that + slot1->nth_rec >= slot2->nth_rec + if, for example, we have a single page + tree which contains (inf, 5, 6, supr) + and we select where x > 20 and x < 30; + in this case slot1->nth_rec will point + to the supr record and slot2->nth_rec + will point to 6. */ + n_rows = 0; + should_count_the_left_border = false; + should_count_the_right_border = false; + } + + } else if (diverged && !diverged_lot) { + + if (slot1->nth_rec < slot1->n_recs + || slot2->nth_rec > 1) { + + diverged_lot = true; + divergence_level = i; + + n_rows = 0; + + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs + - slot1->nth_rec; + } + + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + } + } else if (diverged_lot) { + + n_rows = btr_estimate_n_rows_in_range_on_level( + index, slot1, slot2, n_rows, + &is_n_rows_exact); + } + } +} + +/** Estimates the number of rows in a given index range. +@param[in] index index +@param[in] tuple1 range start, may also be empty tuple +@param[in] mode1 search mode for range start +@param[in] tuple2 range end, may also be empty tuple +@param[in] mode2 search mode for range end +@return estimated number of rows */ +ha_rows +btr_estimate_n_rows_in_range( + dict_index_t* index, + btr_pos_t *tuple1, + btr_pos_t *tuple2) +{ + return btr_estimate_n_rows_in_range_low( + index, tuple1, tuple2, 1); +} + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_uint64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + if (rec_offs_nth_sql_null(offsets, i)) { + break; + } + + n_not_null[i]++; + } +} + +/** Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +result.n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array result.n_non_null_key_vals. +@param[in] index index +@return vector with statistics information +empty vector if the index is unavailable. */ +std::vector<index_field_stats_t> +btr_estimate_number_of_different_key_vals(dict_index_t* index) +{ + btr_cur_t cursor; + page_t* page; + rec_t* rec; + ulint n_cols; + ib_uint64_t* n_diff; + ib_uint64_t* n_not_null; + ibool stats_null_not_equal; + uintmax_t n_sample_pages=1; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + ulint i; + ulint j; + uintmax_t add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + rec_offs* offsets_rec = NULL; + rec_offs* offsets_next_rec = NULL; + + std::vector<index_field_stats_t> result; + + /* For spatial index, there is no such stats can be + fetched. */ + ut_ad(!dict_index_is_spatial(index)); + + n_cols = dict_index_get_n_unique(index); + + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * n_cols + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof(n_diff[0])); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } + + if (srv_stats_sample_traditional) { + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_transient_sample_pages > index->stat_index_size) { + if (index->stat_index_size > 0) { + n_sample_pages = index->stat_index_size; + } + } else { + n_sample_pages = srv_stats_transient_sample_pages; + } + } else { + /* New logaritmic number of pages that are estimated. + Number of pages estimated should be between 1 and + index->stat_index_size. + + If we have only 0 or 1 index pages then we can only take 1 + sample. We have already initialized n_sample_pages to 1. + + So taking index size as I and sample as S and log(I)*S as L + + requirement 1) we want the out limit of the expression to not exceed I; + requirement 2) we want the ideal pages to be at least S; + so the current expression is min(I, max( min(S,I), L) + + looking for simplifications: + + case 1: assume S < I + min(I, max( min(S,I), L) -> min(I , max( S, L)) + + but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L. + + so we have: min(I , L) + + case 2: assume I < S + min(I, max( min(S,I), L) -> min(I, max( I, L)) + + case 2a: L > I + min(I, max( I, L)) -> min(I, L) -> I + + case 2b: when L < I + min(I, max( I, L)) -> min(I, I ) -> I + + so taking all case2 paths is I, our expression is: + n_pages = S < I? min(I,L) : I + */ + if (index->stat_index_size > 1) { + n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) + ? ut_min(index->stat_index_size, + static_cast<ulint>( + log2(double(index->stat_index_size)) + * double(srv_stats_transient_sample_pages))) + : index->stat_index_size; + } + } + + /* Sanity check */ + ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size)); + + /* We sample some pages in the index to get an estimate */ + + for (i = 0; i < n_sample_pages; i++) { + mtr_start(&mtr); + + bool available; + + available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, + &cursor, &mtr); + + if (!available) { + mtr_commit(&mtr); + mem_heap_free(heap); + + return result; + } + + /* Count the number of different key values for each prefix of + the key on this index page. If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. */ + + if (!index->is_readable()) { + mtr_commit(&mtr); + goto exit_loop; + } + + page = btr_cur_get_page(&cursor); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + const ulint n_core = page_is_leaf(page) + ? index->n_core_fields : 0; + + if (!page_rec_is_supremum(rec)) { + not_empty_flag = 1; + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + n_core, + ULINT_UNDEFINED, &heap); + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_rec, n_not_null); + } + } + + while (!page_rec_is_supremum(rec)) { + ulint matched_fields; + rec_t* next_rec = page_rec_get_next(rec); + if (page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); + break; + } + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + n_core, + ULINT_UNDEFINED, + &heap); + + cmp_rec_rec(rec, next_rec, + offsets_rec, offsets_next_rec, + index, stats_null_not_equal, + &matched_fields); + + for (j = matched_fields; j < n_cols; j++) { + /* We add one if this index record has + a different prefix from the previous */ + + n_diff[j]++; + } + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_next_rec, n_not_null); + } + + total_external_size + += btr_rec_get_externally_stored_len( + rec, offsets_rec); + + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. */ + { + rec_offs* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } + } + + if (n_cols == dict_index_get_n_unique_in_tree(index) + && page_has_siblings(page)) { + + /* If there is more than one leaf page in the tree, + we add one because we know that the first record + on the page certainly had a different prefix than the + last record on the previous index page in the + alphabetical order. Before this fix, if there was + just one big record on each clustered index page, the + algorithm grossly underestimated the number of rows + in the table. */ + + n_diff[n_cols - 1]++; + } + + mtr_commit(&mtr); + } + +exit_loop: + /* If we saw k borders between different key values on + n_sample_pages leaf pages, we can estimate how many + there will be in index->stat_n_leaf_pages */ + + /* We must take into account that our sample actually represents + also the pages used for external storage of fields (those pages are + included in index->stat_n_leaf_pages) */ + + result.reserve(n_cols); + + for (j = 0; j < n_cols; j++) { + index_field_stats_t stat; + + stat.n_diff_key_vals + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + total_external_size, not_empty_flag); + + /* If the tree is small, smaller than + 10 * n_sample_pages + total_external_size, then + the above estimate is ok. For bigger trees it is common that we + do not see any borders between key values in the few pages + we pick. But still there may be n_sample_pages + different key values, or even more. Let us try to approximate + that: */ + + add_on = index->stat_n_leaf_pages + / (10 * (n_sample_pages + + total_external_size)); + + if (add_on > n_sample_pages) { + add_on = n_sample_pages; + } + + stat.n_diff_key_vals += add_on; + + stat.n_sample_sizes = n_sample_pages; + + if (n_not_null != NULL) { + stat.n_non_null_key_vals = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } + + result.push_back(stat); + } + + mem_heap_free(heap); + + return result; +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. +@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(len_is_stored(local_len)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); + } + } + + return total_extern_len >> srv_page_size_shift; +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. */ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + bool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val &= ~BTR_EXTERN_OWNER_FLAG; + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val |= BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr); + } else { + mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len + + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. */ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i, false)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, false, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + if (!rec_offs_any_extern(offsets)) { + return; + } + + const ulint n = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, true, mtr); + } + } +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +uint32_t +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +uint32_t +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/** Deallocate a buffer block that was reserved for a BLOB part. +@param block buffer block +@param all flag whether to remove a ROW_FORMAT=COMPRESSED page +@param mtr mini-transaction to commit */ +static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) +{ + const page_id_t page_id(block->page.id()); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + mtr->commit(); + + const ulint fold= page_id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold)) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + /* Attempt to deallocate the redundant copy of the uncompressed page + if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ + buf_LRU_free_page(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Helper class used while writing blob pages, during insert or update. */ +struct btr_blob_log_check_t { + /** Persistent cursor on a clusterex index record with blobs. */ + btr_pcur_t* m_pcur; + /** Mini transaction holding the latches for m_pcur */ + mtr_t* m_mtr; + /** rec_get_offsets(rec, index); offset of clust_rec */ + const rec_offs* m_offsets; + /** The block containing clustered record */ + buf_block_t** m_block; + /** The clustered record pointer */ + rec_t** m_rec; + /** The blob operation code */ + enum blob_op m_op; + + /** Constructor + @param[in] pcur persistent cursor on a clustered + index record with blobs. + @param[in] mtr mini-transaction holding latches for + pcur. + @param[in] offsets offsets of the clust_rec + @param[in,out] block record block containing pcur record + @param[in,out] rec the clustered record pointer + @param[in] op the blob operation code */ + btr_blob_log_check_t( + btr_pcur_t* pcur, + mtr_t* mtr, + const rec_offs* offsets, + buf_block_t** block, + rec_t** rec, + enum blob_op op) + : m_pcur(pcur), + m_mtr(mtr), + m_offsets(offsets), + m_block(block), + m_rec(rec), + m_op(op) + { + ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets)); + ut_ad((*m_block)->frame == page_align(*m_rec)); + ut_ad(*m_rec == btr_pcur_get_rec(m_pcur)); + } + + /** Check if there is enough space in log file. Commit and re-start the + mini transaction. */ + void check() + { + dict_index_t* index = m_pcur->index(); + ulint offs = 0; + uint32_t page_no = FIL_NULL; + + if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { + offs = page_offset(*m_rec); + page_no = (*m_block)->page.id().page_no(); + buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__); + ut_ad(page_no != FIL_NULL); + } else { + btr_pcur_store_position(m_pcur, m_mtr); + } + m_mtr->commit(); + + DEBUG_SYNC_C("blob_write_middle"); + + log_free_check(); + + DEBUG_SYNC_C("blob_write_middle_after_check"); + + const mtr_log_t log_mode = m_mtr->get_log_mode(); + m_mtr->start(); + m_mtr->set_log_mode(log_mode); + index->set_modified(*m_mtr); + + if (UNIV_UNLIKELY(page_no != FIL_NULL)) { + m_pcur->btr_cur.page_cur.block = btr_block_get( + *index, page_no, RW_X_LATCH, false, m_mtr); + m_pcur->btr_cur.page_cur.rec + = m_pcur->btr_cur.page_cur.block->frame + + offs; + + buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block); + } else { + ut_ad(m_pcur->rel_pos == BTR_PCUR_ON); + bool ret = btr_pcur_restore_position( + BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL, + m_pcur, m_mtr); + + ut_a(ret); + } + + *m_block = btr_pcur_get_block(m_pcur); + *m_rec = btr_pcur_get_rec(m_pcur); + + rec_offs_make_valid(*m_rec, index, true, + const_cast<rec_offs*>(m_offsets)); + + ut_ad(m_mtr->memo_contains_page_flagged( + *m_rec, + MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); + + ut_ad((m_op == BTR_STORE_INSERT_BULK) + == !m_mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_X_LOCK)); + } +}; + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. + +TODO: If the allocation extends the tablespace, it will not be redo logged, in +any mini-transaction. Tablespace extension should be redo-logged, so that +recovery will not fail when the big_rec was written to the extended portion of +the file, in case the file was somehow truncated in the crash. + +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in/out: a persistent cursor. if + btr_mtr is restarted, then this can + be repositioned. */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*! in: operation code */ +{ + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint space_id; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + dberr_t error = DB_SUCCESS; + dict_index_t* index = pcur->index(); + buf_block_t* rec_block = btr_pcur_get_block(pcur); + rec_t* rec = btr_pcur_get_rec(pcur); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(op == BTR_STORE_INSERT_BULK + || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, + &rec, op); + page_zip = buf_block_get_page_zip(rec_block); + space_id = rec_block->page.id().space(); + ut_a(fil_page_index_page_check(page_align(rec)) + || op == BTR_STORE_INSERT_BULK); + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, int(page_zip_level), + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). */ + ut_a(op == BTR_STORE_UPDATE + || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG) + || !memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Space available in compressed page to carry blob data */ + const ulint payload_size_zip = rec_block->physical_size() + - FIL_PAGE_DATA; + + /* Space available in uncompressed page to carry blob data */ + const ulint payload_size = payload_size_zip + - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END); + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + const ulint field_no = big_rec_vec->fields[i].field_no; + + field_ref = btr_rec_get_field_ref(rec, offsets, field_no); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* A zero BLOB pointer should have been initially inserted. */ + ut_a(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + extern_len = big_rec_vec->fields[i].len; + MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len); + ut_a(extern_len > 0); + + uint32_t prev_page_no = FIL_NULL; + + if (page_zip) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (Bytef*) + big_rec_vec->fields[i].data; + c_stream.avail_in = static_cast<uInt>(extern_len); + } + + for (ulint blob_npages = 0;; ++blob_npages) { + buf_block_t* block; + const ulint commit_freq = 4; + uint32_t r_extents; + + ut_ad(page_align(field_ref) == page_align(rec)); + + if (!(blob_npages % commit_freq)) { + + redo_log.check(); + + field_ref = btr_rec_get_field_ref( + rec, offsets, field_no); + + page_zip = buf_block_get_page_zip(rec_block); + } + + mtr.start(); + index->set_modified(mtr); + mtr.set_log_mode(btr_mtr->get_log_mode()); + + buf_page_get(rec_block->page.id(), + rec_block->zip_size(), RW_X_LATCH, &mtr); + + uint32_t hint_prev = prev_page_no; + if (hint_prev == FIL_NULL) { + hint_prev = rec_block->page.id().page_no(); + } + + if (!fsp_reserve_free_extents(&r_extents, + index->table->space, 1, + FSP_BLOB, &mtr, 1)) { + mtr.commit(); + error = DB_OUT_OF_FILE_SPACE; + goto func_exit; + } + + block = btr_page_alloc(index, hint_prev + 1, + FSP_NO_DIR, 0, &mtr, &mtr); + + index->table->space->release_free_extents(r_extents); + + ut_a(block != NULL); + + const uint32_t page_no = block->page.id().page_no(); + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block; + + prev_block = buf_page_get( + page_id_t(space_id, prev_page_no), + rec_block->zip_size(), + RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(prev_block, + SYNC_EXTERN_STORAGE); + + if (page_zip) { + mtr.write<4>(*prev_block, + prev_block->frame + + FIL_PAGE_NEXT, + page_no); + memcpy_aligned<4>( + buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_block->frame + + FIL_PAGE_NEXT, 4); + } else { + mtr.write<4>(*prev_block, + BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA + + prev_block->frame, + page_no); + } + } else if (dict_index_is_online_ddl(index)) { + row_log_table_blob_alloc(index, page_no); + } + + ut_ad(!page_has_siblings(block->frame)); + ut_ad(!fil_page_get_type(block->frame)); + + if (page_zip) { + int err; + page_zip_des_t* blob_page_zip; + + mtr.write<1>(*block, + FIL_PAGE_TYPE + 1 + block->frame, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2); + block->page.zip.data[FIL_PAGE_TYPE + 1] + = block->frame[FIL_PAGE_TYPE + 1]; + + c_stream.next_out = block->frame + + FIL_PAGE_DATA; + c_stream.avail_out = static_cast<uInt>( + payload_size_zip); + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + mtr.memcpy(*block, + FIL_PAGE_DATA, + page_zip_get_size(page_zip) + - FIL_PAGE_DATA + - c_stream.avail_out); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. */ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, block->frame, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + /* We compress a page when finish bulk insert.*/ + if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) { + page_zip_write_blob_ptr( + rec_block, rec, index, offsets, + field_no, &mtr); + } + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. */ + btr_blob_free(block, FALSE, &mtr); + + if (err == Z_STREAM_END) { + break; + } + } else { + mtr.write<1>(*block, FIL_PAGE_TYPE + 1 + + block->frame, + FIL_PAGE_TYPE_BLOB); + + if (extern_len > payload_size) { + store_len = payload_size; + } else { + store_len = extern_len; + } + + mtr.memcpy<mtr_t::MAYBE_NOP>( + *block, + FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE + + block->frame, + static_cast<const byte*> + (big_rec_vec->fields[i].data) + + big_rec_vec->fields[i].len + - extern_len, store_len); + mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN + + FIL_PAGE_DATA + block->frame, + store_len); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO + + FIL_PAGE_DATA, 4, 0xff); + + extern_len -= store_len; + + ut_ad(!mach_read_from_4(BTR_EXTERN_LEN + + field_ref)); + mtr.write<4>(*rec_block, + BTR_EXTERN_LEN + 4 + field_ref, + big_rec_vec->fields[i].len + - extern_len); + + if (prev_page_no == FIL_NULL) { + ut_ad(blob_npages == 0); + mtr.write<4,mtr_t::MAYBE_NOP>( + *rec_block, + field_ref + BTR_EXTERN_SPACE_ID, + space_id); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mtr.write<4>(*rec_block, field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA); + } + + prev_page_no = page_no; + + mtr.commit(); + + if (extern_len == 0) { + break; + } + } + } + + DBUG_EXECUTE_IF("btr_store_big_rec_extern", + error = DB_OUT_OF_FILE_SPACE; + goto func_exit;); + + rec_offs_make_nth_extern(offsets, field_no); + } + +func_exit: + if (page_zip) { + deflateEnd(&c_stream); + } + + if (heap != NULL) { + mem_heap_free(heap); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must be valid. */ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page. +@param[in] block uncompressed BLOB page +@param[in] read true=read, false=purge */ +static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read) +{ + uint16_t type= fil_page_get_type(block.frame); + + if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)) + return; + /* FIXME: take the tablespace as a parameter */ + if (fil_space_t *space= fil_space_t::get(block.page.id().space())) + { + /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB + pages. Do not print anything about the type mismatch when reading + a BLOB page that may be from old versions. */ + if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags)) + { + ib::fatal() << "FIL_PAGE_TYPE=" << type + << (read ? " on BLOB read file " : " on BLOB purge file ") + << space->chain.start->name + << " page " << block.page.id().page_no(); + } + space->release(); + } +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr + containing the latch to data an an + X-latch to the index tree */ +{ + page_t* page; + const uint32_t space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + const uint32_t start_page = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + uint32_t page_no; + uint32_t next_page_no; + mtr_t mtr; + + ut_ad(index->is_primary()); + ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(local_mtr->memo_contains_page_flagged(field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + ut_ad(local_mtr->is_named_space( + page_get_space_id(page_align(field_ref)))); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. */ + ut_a(rollback); + return; + } + + ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) + & ~((BTR_EXTERN_OWNER_FLAG + | BTR_EXTERN_INHERITED_FLAG) << 24))); + ut_ad(space_id == index->table->space->id); + ut_ad(space_id == index->table->space_id); + + const ulint ext_zip_size = index->table->space->zip_size(); + const ulint rec_zip_size = rec ? ext_zip_size : 0; + + /* !rec holds in a call from purge when field_ref is in an undo page */ + ut_ad(rec || !block->page.zip.data); + + for (;;) { +#ifdef UNIV_DEBUG + buf_block_t* rec_block; +#endif /* UNIV_DEBUG */ + buf_block_t* ext_block; + + mtr_start(&mtr); + mtr.set_spaces(*local_mtr); + mtr.set_log_mode(local_mtr->get_log_mode()); + + ut_ad(!index->table->is_temporary() + || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); + + const page_t* p = page_align(field_ref); + + const page_id_t page_id(page_get_space_id(p), + page_get_page_no(p)); + +#ifdef UNIV_DEBUG + rec_block = +#endif /* UNIV_DEBUG */ + buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK); + page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO); + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || (rollback + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { + + /* Do not free */ + mtr_commit(&mtr); + + return; + } + + if (page_no == start_page && dict_index_is_online_ddl(index)) { + row_log_table_blob_free(index, start_page); + } + + ext_block = buf_page_get( + page_id_t(space_id, page_no), ext_zip_size, + RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). */ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + ut_error; + } + next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT); + + btr_page_free(index, ext_block, &mtr, true); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4); + page_zip_write_blob_ptr(block, rec, index, + offsets, i, &mtr); + } else { + mtr.write<4>(*block, + BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + + 4 + field_ref, + 0U); + } + } else { + ut_ad(!block->page.zip.data); + btr_check_blob_fil_page_type(*ext_block, false); + + next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + btr_page_free(index, ext_block, &mtr, true); + + mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + 4 + + field_ref, 0U); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); + ut_ad(page_rec_is_leaf(rec)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, block, i, rollback, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, block, + ufield->field_no, rollback, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. +@return number of bytes written to buf */ +static +ulint +btr_copy_blob_prefix( +/*=================*/ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + uint32_t len, /*!< in: length of buf, in bytes */ + page_id_t id, /*!< in: page identifier of the first BLOB page */ + uint32_t offset) /*!< in: offset on the first BLOB page */ +{ + ulint copied_len = 0; + + for (;;) { + mtr_t mtr; + buf_block_t* block; + const page_t* page; + const byte* blob_header; + ulint part_len; + ulint copy_len; + + mtr_start(&mtr); + + block = buf_page_get(id, 0, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(block); + + btr_check_blob_fil_page_type(*block, true); + + blob_header = page + offset; + part_len = btr_blob_get_part_len(blob_header); + copy_len = ut_min(part_len, len - copied_len); + + memcpy(buf + copied_len, + blob_header + BTR_BLOB_HDR_SIZE, copy_len); + copied_len += copy_len; + + id.set_page_no(btr_blob_get_next_page_no(blob_header)); + + mtr_commit(&mtr); + + if (id.page_no() == FIL_NULL || copy_len != part_len) { + MEM_CHECK_DEFINED(buf, copied_len); + return(copied_len); + } + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + ut_ad(copied_len <= len); + } +} + +/** Copies the prefix of a compressed BLOB. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the field, +or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size +@param[in] id page identifier of the BLOB pages +@return number of bytes written to buf */ +static +ulint +btr_copy_zblob_prefix( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = static_cast<uInt>(len); + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(id.space()); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + for (;;) { + buf_page_t* bpage; + uint32_t next_page_no; + + /* There is no latch on bpage directly. Instead, + bpage is protected by the B-tree page latch that + is being held on the clustered index record, or, + in row_merge_copy_blobs(), by an exclusive table lock. */ + bpage = buf_page_get_zip(id, zip_size); + + if (UNIV_UNLIKELY(!bpage)) { + ib::error() << "Cannot load compressed BLOB " << id; + goto func_exit; + } + + if (UNIV_UNLIKELY + (fil_page_get_type(bpage->zip.data) != page_type)) { + + ib::error() << "Unexpected type " + << fil_page_get_type(bpage->zip.data) + << " of compressed BLOB page " << id; + + ut_ad(0); + goto end_of_blob; + } + + next_page_no = mach_read_from_4(bpage->zip.data + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. */ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = uInt(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ib::error() << "inflate() of compressed BLOB page " + << id + << " returned " << err + << " (" << d_stream.msg << ")"; + + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ib::error() + << "Unexpected end of compressed " + << "BLOB page " << id; + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + buf_page_release_zip(bpage); + goto func_exit; + } + + buf_page_release_zip(bpage); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + id.set_page_no(next_page_no); + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + MEM_CHECK_DEFINED(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the +field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] id page identifier of the first BLOB page +@param[in] offset offset on the first BLOB page +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + if (len == 0) + return 0; + + return zip_size + ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset) + : btr_copy_blob_prefix(buf, len, id, offset); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. +@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len) +{ + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. */ + + return(0); + } + + uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + len -= local_len; + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + uint32_t(len), + zip_size, + page_id_t( + space_id, + page_no), + offset)); +} + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap) +{ + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + uint32_t space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + uint32_t extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + page_id_t( + space_id, + page_no), + offset); + + return(buf); +} + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offset array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap) +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. */ + return(NULL); + } + + return(btr_copy_externally_stored_field(len, data, + zip_size, local_len, heap)); +} diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc new file mode 100644 index 00000000..ebe9854b --- /dev/null +++ b/storage/innobase/btr/btr0defragment.cc @@ -0,0 +1,843 @@ +/***************************************************************************** + +Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file btr/btr0defragment.cc +Index defragmentation. + +Created 05/29/2014 Rongrong Zhong +Modified 16/07/2014 Sunguck Lee +Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com +*******************************************************/ + +#include "btr0defragment.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "dict0defrag_bg.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "srv0start.h" + +#include <list> + +/* When there's no work, either because defragment is disabled, or because no +query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ +#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 +/* Reduce the target page size by this amount when compression failure happens +during defragmentaiton. 512 is chosen because it's a power of 2 and it is about +3% of the page size. When there are compression failures in defragmentation, +our goal is to get a decent defrag ratio with as few compression failure as +possible. From experimentation it seems that reduce the target size by 512 every +time will make sure the page is compressible within a couple of iterations. */ +#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 + +/** Item in the work queue for btr_degrament_thread. */ +struct btr_defragment_item_t +{ + btr_pcur_t* pcur; /* persistent cursor where + btr_defragment_n_pages should start */ + os_event_t event; /* if not null, signal after work + is done */ + bool removed; /* Mark an item as removed */ + ulonglong last_processed; /* timestamp of last time this index + is processed by defragment thread */ + + btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event); + ~btr_defragment_item_t(); +}; + +/* Work queue for defragmentation. */ +typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; +static btr_defragment_wq_t btr_defragment_wq; + +/* Mutex protecting the defragmentation work queue.*/ +ib_mutex_t btr_defragment_mutex; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Number of compression failures caused by defragmentation since server +start. */ +Atomic_counter<ulint> btr_defragment_compression_failures; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +Atomic_counter<ulint> btr_defragment_failures; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. */ +Atomic_counter<ulint> btr_defragment_count; + +bool btr_defragment_active; + +struct defragment_chunk_state_t +{ + btr_defragment_item_t* m_item; +}; + +static defragment_chunk_state_t defragment_chunk_state; +static void btr_defragment_chunk(void*); + +static tpool::timer* btr_defragment_timer; +static tpool::task_group task_group(1); +static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group); +static void btr_defragment_start(); + +/******************************************************************//** +Constructor for btr_defragment_item_t. */ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +static void submit_defragment_task(void*arg=0) +{ + srv_thread_pool->submit_task(&btr_defragment_task); +} + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init() +{ + srv_defragment_interval = 1000000000ULL / srv_defragment_frequency; + mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex); + defragment_chunk_state.m_item = 0; + btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task); + btr_defragment_active = true; +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + if (!btr_defragment_timer) + return; + delete btr_defragment_timer; + btr_defragment_timer = 0; + task_group.cancel_pending(&btr_defragment_task); + mutex_enter(&btr_defragment_mutex); + std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); + btr_defragment_active = false; +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + dberr_t* err) /*!< out: error code */ +{ + mtr_t mtr; + *err = DB_SUCCESS; + + mtr_start(&mtr); + buf_block_t* block = btr_root_block_get(index, RW_NO_LATCH, &mtr); + page_t* page = NULL; + + if (block) { + page = buf_block_get_frame(block); + } + + if (page == NULL && !index->is_readable()) { + mtr_commit(&mtr); + *err = DB_DECRYPTION_FAILED; + return NULL; + } + + ut_ad(fil_page_index_page_check(page)); + ut_ad(!page_has_siblings(page)); + + if (page_is_leaf(page)) { + // Index root is a leaf page, no need to defragment. + mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = os_event_create(0); + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + if(btr_defragment_wq.size() == 1){ + /* Kick off defragmentation work */ + btr_defragment_start(); + } + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. +*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return NULL; + //return nullptr; + } + mutex_enter(&btr_defragment_mutex); + std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. */ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && index->table->space_id != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. */ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. +@return the to_block to target for next merge operation. */ +static +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. */ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint level = btr_page_get_level(from_page); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. + if (zip_size) { + ulint page_diff = srv_page_size - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. + if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. + orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + btr_defragment_compression_failures++; + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + btr_cur_t parent; + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. */ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(*from_block, *index, mtr); + btr_page_get_father(index, from_block, mtr, &parent); + btr_cur_node_ptr_delete(&parent, mtr); + /* btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); */ + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + // FIXME: reuse the node_ptr! + btr_page_get_father(index, from_block, mtr, &parent); + btr_cur_node_ptr_delete(&parent, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. +@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + if (!page_is_leaf(block->frame)) { + return NULL; + } + + if (!index->table->space || !index->table->space_id) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + first_page = buf_block_get_frame(block); + const ulint zip_size = index->table->space->zip_size(); + + /* 1. Load the pages and calculate the total data size. */ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + uint32_t page_no = btr_page_get_next(page); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + + blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true, + mtr); + } + + if (n_pages == 1) { + if (!page_has_prev(first_page)) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. + if (zip_size) { + ulint size = 0; + uint i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. + for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size /= i; + optimal_page_size = ut_min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = ut_min(static_cast<ulint>( + static_cast<double>(optimal_page_size) + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = uint((total_data_size + optimal_page_size - 1) + / optimal_page_size); + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + btr_defragment_count++; + if (n_pages == n_defragmented) { + btr_defragment_failures++; + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + + + +void btr_defragment_start() { + if (!srv_defragment) + return; + ut_ad(!btr_defragment_wq.empty()); + submit_defragment_task(); +} + + +/** +Callback used by defragment timer + +Throttling "sleep", is implemented via rescheduling the +threadpool timer, which, when fired, will resume the work again, +where it is left. + +The state (current item) is stored in function parameter. +*/ +static void btr_defragment_chunk(void*) +{ + defragment_chunk_state_t* state = &defragment_chunk_state; + + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + if (!state->m_item) { + state->m_item = btr_defragment_get_item(); + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + while (state->m_item && state->m_item->removed) { + btr_defragment_remove_item(state->m_item); + state->m_item = btr_defragment_get_item(); + } + if (!state->m_item) { + /* Queue empty */ + return; + } + + pcur = state->m_item->pcur; + ulonglong now = my_interval_timer(); + ulonglong elapsed = now - state->m_item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. */ + int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000); + if (sleep_ms) { + btr_defragment_timer->set_time(sleep_ms, 0); + return; + } + } + log_free_check(); + mtr_start(&mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + index->set_modified(mtr); + /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. + This entitles us to acquire page latches in any order for the index. */ + mtr_x_lock_index(index, &mtr); + /* This will acquire index->lock SX-latch, which per WL#6363 is allowed + when we are already holding the X-latch. */ + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + first_block = btr_cur_get_block(cursor); + + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + state->m_item->last_processed = now; + } else { + dberr_t err = DB_SUCCESS; + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + err = dict_stats_save_defrag_stats(index); + if (err != DB_SUCCESS) { + ib::error() << "Saving defragmentation stats for table " + << index->table->name + << " index " << index->name() + << " failed with error " << err; + } else { + err = dict_stats_save_defrag_summary(index); + + if (err != DB_SUCCESS) { + ib::error() << "Saving defragmentation summary for table " + << index->table->name + << " index " << index->name() + << " failed with error " << err; + } + } + + btr_defragment_remove_item(state->m_item); + state->m_item = NULL; + } + } +} diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc new file mode 100644 index 00000000..574998a9 --- /dev/null +++ b/storage/innobase/btr/btr0pcur.cc @@ -0,0 +1,681 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0pcur.cc +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#include "btr0pcur.h" +#include "ut0byte.h" +#include "rem0cmp.h" +#include "trx0trx.h" + +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. +@return own: persistent cursor */ +btr_pcur_t* +btr_pcur_create_for_mysql(void) +/*============================*/ +{ + btr_pcur_t* pcur; + DBUG_ENTER("btr_pcur_create_for_mysql"); + + pcur = (btr_pcur_t*) ut_malloc_nokey(sizeof(btr_pcur_t)); + + pcur->btr_cur.index = NULL; + btr_pcur_init(pcur); + + DBUG_PRINT("btr_pcur_create_for_mysql", ("pcur: %p", pcur)); + DBUG_RETURN(pcur); +} + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor) /*!< in, out: persistent cursor */ +{ + btr_pcur_free(cursor); + cursor->old_rec_buf = NULL; + cursor->btr_cur.index = NULL; + cursor->btr_cur.page_cur.rec = NULL; + cursor->old_rec = NULL; + cursor->old_n_core_fields = 0; + cursor->old_n_fields = 0; + cursor->old_stored = false; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; +} + +/**************************************************************//** +Frees the memory for a persistent cursor object. */ +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor) /*!< in, own: persistent cursor */ +{ + DBUG_ENTER("btr_pcur_free_for_mysql"); + DBUG_PRINT("btr_pcur_free_for_mysql", ("pcur: %p", cursor)); + + btr_pcur_free(cursor); + ut_free(cursor); + DBUG_VOID_RETURN; +} + +/**************************************************************//** +The position of the cursor is stored by taking an initial segment of the +record the cursor is positioned on, before, or after, and copying it to the +cursor data structure, or just setting a flag if the cursor id before the +first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the +page where the cursor is positioned must not be empty if the index tree is +not totally empty! */ +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + dict_index_t* index; + ulint offs; + + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + block = btr_pcur_get_block(cursor); + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + page_cursor = btr_pcur_get_page_cur(cursor); + + rec = page_cur_get_rec(page_cursor); + offs = rec - block->frame; + ut_ad(block->page.id().page_no() == page_get_page_no(block->frame)); + ut_ad(block->page.buf_fix_count()); + /* For spatial index, when we do positioning on parent + buffer if necessary, it might not hold latches, but the + tree must be locked to prevent change on the page */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX) + || (index->is_spatial() + && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK))); + + cursor->old_stored = true; + + if (page_is_empty(block->frame)) { + /* It must be an empty index tree; NOTE that in this case + we do not store the modify_clock, but always do a search + if we restore the cursor position */ + + ut_a(!page_has_siblings(block->frame)); + ut_ad(page_is_leaf(block->frame)); + ut_ad(block->page.id().page_no() == index->page); + + if (page_rec_is_supremum_low(offs)) { + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + } else { +before_first: + cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE; + } + + return; + } + + if (page_rec_is_supremum_low(offs)) { + rec = page_rec_get_prev(rec); + + ut_ad(!page_rec_is_infimum(rec)); + if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) { +#if 0 /* MDEV-22867 had to relax this */ + /* If the table is emptied during an ALGORITHM=NOCOPY + DROP COLUMN ... that is not ALGORITHM=INSTANT, + then we must preserve any instant ADD metadata. */ + ut_ad(index->table->instant + || block->page.id().page_no() != index->page); +#endif + ut_ad(index->is_instant() + || block->page.id().page_no() != index->page); + ut_ad(page_get_n_recs(block->frame) == 1); + ut_ad(page_is_leaf(block->frame)); + ut_ad(!page_has_prev(block->frame)); + cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; + return; + } + + cursor->rel_pos = BTR_PCUR_AFTER; + } else if (page_rec_is_infimum_low(offs)) { + rec = page_rec_get_next(rec); + + if (rec_is_metadata(rec, *index)) { + ut_ad(!page_has_prev(block->frame)); + rec = page_rec_get_next(rec); + if (page_rec_is_supremum(rec)) { + goto before_first; + } + } + + cursor->rel_pos = BTR_PCUR_BEFORE; + } else { + cursor->rel_pos = BTR_PCUR_ON; + } + + if (index->is_ibuf()) { + ut_ad(!index->table->not_redundant()); + cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec)); + } else { + cursor->old_n_fields = static_cast<uint16>( + dict_index_get_n_unique_in_tree(index)); + if (index->is_spatial() && !page_rec_is_leaf(rec)) { + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) + == DICT_INDEX_SPATIAL_NODEPTR_SIZE); + /* For R-tree, we have to compare + the child page numbers as well. */ + cursor->old_n_fields + = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } + } + + cursor->old_n_core_fields = index->n_core_fields; + cursor->old_rec = rec_copy_prefix_to_buf(rec, index, + cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); + cursor->block_when_stored.store(block); + + /* Function try to check if block is S/X latch. */ + cursor->modify_clock = buf_block_get_modify_clock(block); +} + +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is + copied */ +{ + ut_free(pcur_receive->old_rec_buf); + memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t)); + + if (pcur_donate->old_rec_buf) { + + pcur_receive->old_rec_buf = (byte*) + ut_malloc_nokey(pcur_donate->buf_size); + + memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf, + pcur_donate->buf_size); + pcur_receive->old_rec = pcur_receive->old_rec_buf + + (pcur_donate->old_rec - pcur_donate->old_rec_buf); + } + + pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields; + pcur_receive->old_n_fields = pcur_donate->old_n_fields; +} + +/** Structure acts as functor to do the latching of leaf pages. +It returns true if latching of leaf pages succeeded and false +otherwise. */ +struct optimistic_latch_leaves +{ + btr_pcur_t *const cursor; + ulint *latch_mode; + mtr_t *const mtr; + + optimistic_latch_leaves(btr_pcur_t *cursor, ulint *latch_mode, mtr_t *mtr) + :cursor(cursor), latch_mode(latch_mode), mtr(mtr) {} + + bool operator() (buf_block_t *hint) const + { + return hint && btr_cur_optimistic_latch_leaves( + hint, cursor->modify_clock, latch_mode, + btr_pcur_get_btr_cur(cursor), __FILE__, __LINE__, mtr); + } +}; + +/**************************************************************//** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. +(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. +@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + dtuple_t* tuple; + page_cur_mode_t mode; + page_cur_mode_t old_mode; + mem_heap_t* heap; + + ut_ad(mtr->is_active()); + //ut_ad(cursor->old_stored); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + if (UNIV_UNLIKELY + (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { + dberr_t err = DB_SUCCESS; + + /* In these cases we do not try an optimistic restoration, + but always do a search */ + + err = btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, + index, latch_mode, + btr_pcur_get_btr_cur(cursor), 0, mtr); + + if (err != DB_SUCCESS) { + ib::warn() << " Error code: " << err + << " btr_pcur_restore_position_func " + << " called from file: " + << file << " line: " << line + << " table: " << index->table->name + << " index: " << index->name; + } + + cursor->latch_mode = + BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->block_when_stored.clear(); + + return(FALSE); + } + + ut_a(cursor->old_rec); + ut_a(cursor->old_n_core_fields); + ut_a(cursor->old_n_core_fields <= index->n_core_fields); + ut_a(cursor->old_n_fields); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + /* Try optimistic restoration. */ + + if (cursor->block_when_stored.run_with_hint( + optimistic_latch_leaves(cursor, &latch_mode, + mtr))) { + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->latch_mode = latch_mode; + + buf_block_dbg_add_level( + btr_pcur_get_block(cursor), + dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + + if (cursor->rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + rec_offs offsets1_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets1_; + rec_offs* offsets2 = offsets2_; + rec = btr_pcur_get_rec(cursor); + + rec_offs_init(offsets1_); + rec_offs_init(offsets2_); + + heap = mem_heap_create(256); + ut_ad(cursor->old_n_core_fields + == index->n_core_fields); + + offsets1 = rec_get_offsets( + cursor->old_rec, index, offsets1, + cursor->old_n_core_fields, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, offsets2, + index->n_core_fields, + cursor->old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return(TRUE); + } + /* This is the same record as stored, + may need to be adjusted for BTR_PCUR_BEFORE/AFTER, + depending on search mode and direction. */ + if (btr_pcur_is_on_user_rec(cursor)) { + cursor->pos_state + = BTR_PCUR_IS_POSITIONED_OPTIMISTIC; + } + return(FALSE); + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, cursor->old_n_fields); + + dict_index_copy_types(tuple, index, cursor->old_n_fields); + + rec_copy_prefix_to_dtuple(tuple, cursor->old_rec, index, + cursor->old_n_core_fields, + cursor->old_n_fields, heap); + ut_ad(dtuple_check_typed(tuple)); + + /* Save the old search mode of the cursor */ + old_mode = cursor->search_mode; + + switch (cursor->rel_pos) { + case BTR_PCUR_ON: + mode = PAGE_CUR_LE; + break; + case BTR_PCUR_AFTER: + mode = PAGE_CUR_G; + break; + case BTR_PCUR_BEFORE: + mode = PAGE_CUR_L; + break; + default: + ut_error; + mode = PAGE_CUR_UNSUPP; + } + + btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode, + cursor, +#ifdef BTR_CUR_HASH_ADAPT + NULL, +#endif /* BTR_CUR_HASH_ADAPT */ + file, line, mtr); + + /* Restore the old search mode */ + cursor->search_mode = old_mode; + + ut_ad(cursor->rel_pos == BTR_PCUR_ON + || cursor->rel_pos == BTR_PCUR_BEFORE + || cursor->rel_pos == BTR_PCUR_AFTER); + rec_offs offsets[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets); + if (cursor->rel_pos == BTR_PCUR_ON + && btr_pcur_is_on_user_rec(cursor) + && !cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap))) { + + /* We have to store the NEW value for the modify clock, + since the cursor can now be on a different page! + But we can retain the value of old_rec */ + + cursor->block_when_stored.store(btr_pcur_get_block(cursor)); + cursor->modify_clock = buf_block_get_modify_clock( + cursor->block_when_stored.block()); + cursor->old_stored = true; + + mem_heap_free(heap); + + return(TRUE); + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(cursor, mtr); + + return(FALSE); +} + +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. */ +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_stored = false; + + const page_t* page = btr_pcur_get_page(cursor); + + if (UNIV_UNLIKELY(!page)) { + return; + } + + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(next_page_no != FIL_NULL); + + ulint mode = cursor->latch_mode; + switch (mode) { + case BTR_SEARCH_TREE: + mode = BTR_SEARCH_LEAF; + break; + case BTR_MODIFY_TREE: + mode = BTR_MODIFY_LEAF; + } + + buf_block_t* next_block = btr_block_get( + *btr_pcur_get_btr_cur(cursor)->index, next_page_no, mode, + page_is_leaf(page), mtr); + + if (UNIV_UNLIKELY(!next_block)) { + return; + } + + const page_t* next_page = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page) + == btr_pcur_get_block(cursor)->page.id().page_no()); +#endif /* UNIV_BTR_DEBUG */ + + btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr); + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + ut_d(page_check_dir(next_page)); +} + +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. */ +static +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint prev_page_no; + page_t* page; + buf_block_t* prev_block; + ulint latch_mode; + ulint latch_mode2; + + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor)); + + latch_mode = cursor->latch_mode; + + if (latch_mode == BTR_SEARCH_LEAF) { + + latch_mode2 = BTR_SEARCH_PREV; + + } else if (latch_mode == BTR_MODIFY_LEAF) { + + latch_mode2 = BTR_MODIFY_PREV; + } else { + latch_mode2 = 0; /* To eliminate compiler warning */ + ut_error; + } + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); + + prev_page_no = btr_page_get_prev(page); + + if (prev_page_no == FIL_NULL) { + } else if (btr_pcur_is_before_first_on_page(cursor)) { + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + latch_mode, mtr); + + page_cur_set_after_last(prev_block, + btr_pcur_get_page_cur(cursor)); + } else { + + /* The repositioned cursor did not end on an infimum + record on a page. Cursor repositioning acquired a latch + also on the previous page, but we do not need the latch: + release it. */ + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(prev_block, latch_mode, mtr); + } + + cursor->latch_mode = latch_mode; + cursor->old_stored = false; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = false; + + if (btr_pcur_is_before_first_on_page(cursor)) { + + if (btr_pcur_is_before_first_in_tree(cursor)) { + + return(FALSE); + } + + btr_pcur_move_backward_from_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_prev_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor, + file, line, 0, mtr); + + if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) { + + if (btr_pcur_is_after_last_on_page(cursor)) { + + btr_pcur_move_to_next_user_rec(cursor, mtr); + } + } else { + ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L)); + + /* Not implemented yet */ + + ut_error; + } +} diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc new file mode 100644 index 00000000..f22e3a59 --- /dev/null +++ b/storage/innobase/btr/btr0sea.cc @@ -0,0 +1,2372 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2017, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.cc +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "srv0mon.h" + +/** Is search system enabled. +Search system is protected by array of latches. */ +char btr_search_enabled; + +/** Number of adaptive hash index partition. */ +ulong btr_ahi_parts; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** The adaptive hash index */ +btr_search_sys_t btr_search_sys; + +/** If the number of records on the page divided by this parameter +would have been successfully accessed using a hash index, the index +is then built on the page, assuming the global limit has been reached */ +#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U + +/** The global limit for consecutive potentially successful hash searches, +before hash index building is started */ +#define BTR_SEARCH_BUILD_LIMIT 100U + +/** Compute a hash value of a record in a page. +@param[in] rec index record +@param[in] offsets return value of rec_get_offsets() +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the hash value */ +static inline +ulint +rec_fold( + const rec_t* rec, + const rec_offs* offsets, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!page_rec_is_metadata(rec)); + ut_ad(n_fields > 0 || n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/** Determine the number of accessed key fields. +@param[in] n_fields number of complete fields +@param[in] n_bytes number of bytes in an incomplete last field +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + ulint n_fields, + ulint n_bytes) +{ + return(n_fields + (n_bytes > 0 ? 1 : 0)); +} + +/** Determine the number of accessed key fields. +@param[in] cursor b-tree cursor +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + const btr_cur_t* cursor) +{ + return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes)); +} + +/** This function should be called before reserving any btr search mutex, if +the intended operation might add nodes to the search system hash table. +Because of the latching order, once we have reserved the btr search system +latch, we cannot allocate a free frame from the buffer pool. Checks that +there is a free buffer frame allocated for hash table heap in the btr search +system. If not, allocates a free frames for the heap. This check makes it +probable that, when have reserved the btr search system latch and we need to +allocate a new node to the hash table, it will succeed. However, the check +will not guarantee success. +@param[in] index index handler */ +static void btr_search_check_free_space_in_heap(const dict_index_t *index) +{ + /* Note that we peek the value of heap->free_block without reserving + the latch: this is ok, because we will not guarantee that there will + be enough free space in the hash table. */ + + buf_block_t *block= buf_block_alloc(); + auto part= btr_search_sys.get_part(*index); + + rw_lock_x_lock(&part->latch); + + if (!btr_search_enabled || part->heap->free_block) + buf_block_free(block); + else + part->heap->free_block= block; + + rw_lock_x_unlock(&part->latch); +} + +/** Set index->ref_count = 0 on all indexes of a table. +@param[in,out] table table handler */ +static void btr_search_disable_ref_count(dict_table_t *table) +{ + for (dict_index_t *index= dict_table_get_first_index(table); index; + index= dict_table_get_next_index(index)) + index->search_info->ref_count= 0; +} + +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +{ + ut_ad(index->freed()); + dict_table_t *table= index->table; + /* Perform the skipped steps of dict_index_remove_from_cache_low(). */ + UT_LIST_REMOVE(table->freed_indexes, index); + rw_lock_free(&index->lock); + dict_mem_index_free(index); + + if (!UT_LIST_GET_LEN(table->freed_indexes) && + !UT_LIST_GET_LEN(table->indexes)) + { + ut_ad(table->id == 0); + dict_mem_table_free(table); + } +} + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable() +{ + dict_table_t* table; + + mutex_enter(&dict_sys.mutex); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + mutex_exit(&dict_sys.mutex); + btr_search_x_unlock_all(); + return; + } + + btr_search_enabled = false; + + /* Clear the index->search_info->ref_count of every index in + the data dictionary cache. */ + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + mutex_exit(&dict_sys.mutex); + + /* Set all block->index = NULL. */ + buf_pool.clear_hash_index(); + + /* Clear the adaptive hash index. */ + btr_search_sys.clear(); + + btr_search_x_unlock_all(); +} + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize) +{ + if (!resize) { + mysql_mutex_lock(&buf_pool.mutex); + bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + mysql_mutex_unlock(&buf_pool.mutex); + if (changed) { + return; + } + } + + btr_search_x_lock_all(); + ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + + if (btr_search_sys.parts[0].heap) { + ut_ad(btr_search_enabled); + btr_search_x_unlock_all(); + return; + } + + btr_search_sys.alloc(hash_size); + + btr_search_enabled = true; + btr_search_x_unlock_all(); +} + +/** Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. +@param[in,out] info search info +@param[in] cursor cursor which was just positioned */ +static +void +btr_search_info_update_hash( + btr_search_t* info, + btr_cur_t* cursor) +{ + dict_index_t* index = cursor->index; + int cmp; + + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); + + if (dict_index_is_ibuf(index)) { + /* So many deletes are performed on an insert buffer tree + that we do not consider a hash index useful on it: */ + + return; + } + + uint16_t n_unique = dict_index_get_n_unique_in_tree(index); + + if (info->n_hash_potential == 0) { + + goto set_new_recomm; + } + + /* Test if the search would have succeeded using the recommended + hash prefix */ + + if (info->n_fields >= n_unique && cursor->up_match >= n_unique) { +increment_potential: + info->n_hash_potential++; + + return; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->low_match, cursor->low_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto set_new_recomm; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->up_match, cursor->up_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto increment_potential; + } + +set_new_recomm: + /* We have to set a new recommendation; skip the hash analysis + for a while to avoid unnecessary CPU time usage when there is no + chance for success */ + + info->hash_analysis = 0; + + cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes, + cursor->low_match, cursor->low_bytes); + info->left_side = cmp >= 0; + info->n_hash_potential = cmp != 0; + + if (cmp == 0) { + /* For extra safety, we set some sensible values here */ + info->n_fields = 1; + info->n_bytes = 0; + } else if (cmp > 0) { + info->n_hash_potential = 1; + + if (cursor->up_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match < cursor->up_match) { + + info->n_fields = static_cast<uint16_t>( + cursor->low_match + 1); + info->n_bytes = 0; + } else { + info->n_fields = static_cast<uint16_t>( + cursor->low_match); + info->n_bytes = static_cast<uint16_t>( + cursor->low_bytes + 1); + } + } else { + if (cursor->low_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + } else if (cursor->low_match > cursor->up_match) { + + info->n_fields = static_cast<uint16_t>( + cursor->up_match + 1); + info->n_bytes = 0; + } else { + info->n_fields = static_cast<uint16_t>( + cursor->up_match); + info->n_bytes = static_cast<uint16_t>( + cursor->up_bytes + 1); + } + } +} + +/** Update the block search info on hash successes. NOTE that info and +block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any +semaphore, to save CPU time! Do not assume the fields are consistent. +@return TRUE if building a (new) hash index on the block is recommended +@param[in,out] info search info +@param[in,out] block buffer block */ +static +bool +btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) +{ + ut_ad(!btr_search_own_any()); + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + info->last_hash_succ = FALSE; + ut_d(auto state= block->page.state()); + ut_ad(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_REMOVE_HASH); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->index) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + + if ((block->n_hash_helps > page_get_n_recs(block->frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->index) + || (block->n_hash_helps + > 2U * page_get_n_recs(block->frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(true); + } + } + + return(false); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Maximum number of records in a page */ +constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +__attribute__((nonnull)) +/** +Insert an entry into the hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@param table hash table +@param heap memory heap +@param fold folded value of the record +@param block buffer block containing the record +@param data the record +@retval true on success +@retval false if no more memory could be allocated */ +static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap, + ulint fold, +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t *block, /*!< buffer block of data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t *data) +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->frame == page_align(data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ut_ad(btr_search_enabled); + + hash_cell_t *cell= &table->array[table->calc_hash(fold)]; + + for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev; + prev= prev->next) + { + if (prev->fold == fold) + { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t *prev_block= prev->block; + ut_a(prev_block->frame == page_align(prev->data)); + ut_a(prev_block->n_pointers-- < MAX_N_POINTERS); + ut_a(block->n_pointers++ < MAX_N_POINTERS); + + prev->block= block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + prev->data= data; + return true; + } + } + + /* We have to allocate a new chain node */ + ha_node_t *node= static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node)); + + if (!node) + return false; + + ha_node_set_data(node, block, data); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->n_pointers++ < MAX_N_POINTERS); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + node->fold= fold; + node->next= nullptr; + + ha_node_t *prev= static_cast<ha_node_t*>(cell->node); + if (!prev) + cell->node= node; + else + { + while (prev->next) + prev= prev->next; + prev->next= node; + } + return true; +} + +__attribute__((nonnull)) +/** Delete a record. +@param table hash table +@param heap memory heap +@param del_node record to be deleted */ +static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap, + ha_node_t *del_node) +{ + ut_ad(btr_search_enabled); +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(del_node->block->frame == page_align(del_node->data)); + ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + const ulint fold= del_node->fold; + + HASH_DELETE(ha_node_t, next, table, fold, del_node); + + ha_node_t *top= static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top)); + + if (del_node != top) + { + /* Compact the heap of nodes by moving the top in the place of del_node. */ + *del_node= *top; + hash_cell_t *cell= &table->array[table->calc_hash(top->fold)]; + + /* Look for the pointer to the top node, to update it */ + if (cell->node == top) + /* The top node is the first in the chain */ + cell->node= del_node; + else + { + /* We have to look for the predecessor */ + ha_node_t *node= static_cast<ha_node_t*>(cell->node); + + while (top != HASH_GET_NEXT(next, node)) + node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node)); + + /* Now we have the predecessor node */ + node->next= del_node; + } + } + + /* Free the occupied space */ + mem_heap_free_top(heap, sizeof *top); +} + +__attribute__((nonnull)) +/** Delete all pointers to a page. +@param table hash table +@param heap memory heap +@param page record to be deleted */ +static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap, + ulint fold, const page_t *page) +{ + for (ha_node_t *node= ha_chain_get_first(table, fold); node; ) + { + if (page_align(ha_node_get_data(node)) == page) + { + ha_delete_hash_node(table, heap, node); + /* The deletion may compact the heap of nodes and move other nodes! */ + node= ha_chain_get_first(table, fold); + } + else + node= ha_chain_get_next(node); + } +#ifdef UNIV_DEBUG + /* Check that all nodes really got deleted */ + for (ha_node_t *node= ha_chain_get_first(table, fold); node; + node= ha_chain_get_next(node)) + ut_ad(page_align(ha_node_get_data(node)) != page); +#endif /* UNIV_DEBUG */ +} + +/** Delete a record if found. +@param table hash table +@param heap memory heap for the hash bucket chain +@param fold folded value of the searched data +@param data pointer to the record +@return whether the record was found */ +static bool ha_search_and_delete_if_found(hash_table_t *table, + mem_heap_t *heap, + ulint fold, const rec_t *data) +{ + if (ha_node_t *node= ha_search_with_data(table, fold, data)) + { + ha_delete_hash_node(table, heap, node); + return true; + } + + return false; +} + +__attribute__((nonnull)) +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table hash table +@param fold folded value of the searched data +@param data pointer to the data +@param new_data new pointer to the data +@return whether the element was found */ +static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold, + const rec_t *data, +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + /** block containing new_data */ + buf_block_t *new_block, +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t *new_data) +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(new_block->frame == page_align(new_data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + if (!btr_search_enabled) + return false; + + if (ha_node_t *node= ha_search_with_data(table, fold, data)) + { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(node->block->n_pointers-- < MAX_N_POINTERS); + ut_a(new_block->n_pointers++ < MAX_N_POINTERS); + node->block= new_block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data= new_data; + + return true; + } + + return false; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +#else +# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d) +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found(table,fold,data,new_data) +#endif + +/** Updates a hash node reference when it has been unsuccessfully used in a +search which could have succeeded with the used hash parameters. This can +happen because when building a hash index for a page, we do not check +what happens at page boundaries, and therefore there can be misleading +hash nodes. Also, collisions in the fold value can lead to misleading +references. This function lazily fixes these imperfections in the hash +index. +@param[in] info search info +@param[in] block buffer block where cursor positioned +@param[in] cursor cursor */ +static +void +btr_search_update_hash_ref( + const btr_search_t* info, + buf_block_t* block, + const btr_cur_t* cursor) +{ + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); + + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(page_align(btr_cur_get_rec(cursor)) == block->frame); + ut_ad(page_is_leaf(block->frame)); + assert_block_ahi_valid(block); + + dict_index_t* index = block->index; + + if (!index || !info->n_hash_potential) { + return; + } + + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(index == cursor->index); + ut_ad(!dict_index_is_ibuf(index)); + auto part = btr_search_sys.get_part(*index); + rw_lock_x_lock(&part->latch); + ut_ad(!block->index || block->index == index); + + if (block->index + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side) + && btr_search_enabled) { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + const rec_t* rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + goto func_exit; + } + + ulint fold = rec_fold( + rec, + rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ha_insert_for_fold(&part->table, part->heap, fold, block, rec); + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +func_exit: + rw_lock_x_unlock(&part->latch); +} + +/** Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@param[in,out] cursor guess cursor position +@param[in] can_only_compare_to_cursor_rec + if we do not have a latch on the page of cursor, + but a latch corresponding search system, then + ONLY the columns of the record UNDER the cursor + are protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! +@param[in] tuple data tuple +@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE +@return whether a match was found */ +static +bool +btr_search_check_guess( + btr_cur_t* cursor, + bool can_only_compare_to_cursor_rec, + const dtuple_t* tuple, + ulint mode) +{ + rec_t* rec; + ulint n_unique; + ulint match; + int cmp; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + ibool success = FALSE; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(page_rec_is_leaf(rec)); + + match = 0; + + offsets = rec_get_offsets(rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets, &match); + + if (mode == PAGE_CUR_GE) { + if (cmp > 0) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = TRUE; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp < 0) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp >= 0) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp <= 0) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + ut_ad(!page_rec_is_infimum(rec)); + + const rec_t* prev_rec = page_rec_get_prev(rec); + + if (page_rec_is_infimum(prev_rec)) { + success = !page_has_prev(page_align(prev_rec)); + goto exit_func; + } + + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, prev_rec, offsets, &match); + if (mode == PAGE_CUR_GE) { + success = cmp > 0; + } else { + success = cmp >= 0; + } + } else { + ut_ad(!page_rec_is_supremum(rec)); + + const rec_t* next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + if (!page_has_next(page_align(next_rec))) { + cursor->up_match = 0; + success = TRUE; + } + + goto exit_func; + } + + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + cursor->index->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, next_rec, offsets, &match); + if (mode == PAGE_CUR_LE) { + success = cmp < 0; + cursor->up_match = match; + } else { + success = cmp <= 0; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +static +void +btr_search_failure(btr_search_t* info, btr_cur_t* cursor) +{ + cursor->flag = BTR_CUR_HASH_FAIL; + +#ifdef UNIV_SEARCH_PERF_STAT + ++info->n_hash_fail; + + if (info->n_hash_succ > 0) { + --info->n_hash_succ; + } +#endif /* UNIV_SEARCH_PERF_STAT */ + + info->last_hash_succ = FALSE; +} + +/** Clear the adaptive hash index on all pages in the buffer pool. */ +inline void buf_pool_t::clear_hash_index() +{ + ut_ad(btr_search_own_all(RW_LOCK_X)); + ut_ad(!resizing); + ut_ad(!btr_search_enabled); + + std::set<dict_index_t*> garbage; + + for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; ) + { + for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size; + block != end; block++) + { + dict_index_t *index= block->index; + assert_block_ahi_valid(block); + + /* We can clear block->index and block->n_pointers when + btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */ + + if (!index) + { +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(!block->n_pointers); +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + continue; + } + + ut_d(buf_page_state state= block->page.state()); + /* Another thread may have set the state to + BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed(). + + The state change in buf_pool_t::realloc() is not observable + here, because in that case we would have !block->index. + + In the end, the entire adaptive hash index will be removed. */ + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH); +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers= 0; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + if (index->freed()) + garbage.insert(index); + block->index= nullptr; + } + } + + for (dict_index_t *index : garbage) + btr_search_lazy_free(index); +} + +/** Get a buffer block from an adaptive hash index pointer. +This function does not return if the block is not identified. +@param ptr pointer to within a page frame +@return pointer to block, never NULL */ +inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const +{ + chunk_t::map *chunk_map = chunk_t::map_ref; + ut_ad(chunk_t::map_ref == chunk_t::map_reg); + ut_ad(!resizing); + + chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr); + ut_a(it != chunk_map->begin()); + + chunk_t *chunk= it == chunk_map->end() + ? chunk_map->rbegin()->second + : (--it)->second; + + const size_t offs= size_t(ptr - chunk->blocks->frame) >> srv_page_size_shift; + ut_a(offs < chunk->size); + + buf_block_t *block= &chunk->blocks[offs]; + /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that + block[n].frame == block->frame + n * srv_page_size. Check it. */ + ut_ad(block->frame == page_align(ptr)); + /* Read the state of the block without holding hash_lock. + A state transition from BUF_BLOCK_FILE_PAGE to + BUF_BLOCK_REMOVE_HASH is possible during this execution. */ + ut_d(const buf_page_state state = block->page.state()); + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH); + return block; +} + +/** Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@param[in,out] index index +@param[in,out] info index search info +@param[in] tuple logical record +@param[in] mode PAGE_CUR_L, .... +@param[in] latch_mode BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch is 0, we will + have a latch set on the cursor page, otherwise + we assume the caller uses his search latch + to protect the record! +@param[out] cursor tree cursor +@param[in] ahi_latch the adaptive hash index latch being held, + or NULL +@param[in] mtr mini transaction +@return whether the search succeeded */ +bool +btr_search_guess_on_hash( + dict_index_t* index, + btr_search_t* info, + const dtuple_t* tuple, + ulint mode, + ulint latch_mode, + btr_cur_t* cursor, + rw_lock_t* ahi_latch, + mtr_t* mtr) +{ + ulint fold; + index_id_t index_id; + + ut_ad(mtr->is_active()); + ut_ad(!ahi_latch || rw_lock_own_flagged( + ahi_latch, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + if (!btr_search_enabled) { + return false; + } + + ut_ad(!index->is_ibuf()); + ut_ad(!ahi_latch + || ahi_latch == &btr_search_sys.get_part(*index)->latch); + ut_ad((latch_mode == BTR_SEARCH_LEAF) + || (latch_mode == BTR_MODIFY_LEAF)); + compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH}); + compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH}); + + /* Not supported for spatial index */ + ut_ad(!dict_index_is_spatial(index)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! */ + + if (info->n_hash_potential == 0) { + return false; + } + + cursor->n_fields = info->n_fields; + cursor->n_bytes = info->n_bytes; + + if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) { + return false; + } + + index_id = index->id; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ++; +#endif + fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id); + + cursor->fold = fold; + cursor->flag = BTR_CUR_HASH; + + auto part = btr_search_sys.get_part(*index); + const rec_t* rec; + + if (!ahi_latch) { + rw_lock_s_lock(&part->latch); + + if (!btr_search_enabled) { + goto fail; + } + } else { + ut_ad(btr_search_enabled); + ut_ad(rw_lock_own(ahi_latch, RW_LOCK_S)); + } + + rec = static_cast<const rec_t*>( + ha_search_and_get_data(&part->table, fold)); + + if (!rec) { + if (!ahi_latch) { +fail: + rw_lock_s_unlock(&part->latch); + } + + btr_search_failure(info, cursor); + return false; + } + + buf_block_t* block = buf_pool.block_from_ahi(rec); + + if (!ahi_latch) { + page_hash_latch* hash_lock = buf_pool.hash_lock_get( + block->page.id()); + hash_lock->read_lock(); + + if (block->page.state() == BUF_BLOCK_REMOVE_HASH) { + /* Another thread is just freeing the block + from the LRU list of the buffer pool: do not + try to access this page. */ + hash_lock->read_unlock(); + goto fail; + } + + const bool fail = index != block->index + && index_id == block->index->id; + ut_a(!fail || block->index->freed()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED); + + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + hash_lock->read_unlock(); + block->page.set_accessed(); + + buf_page_make_young_if_needed(&block->page); + mtr_memo_type_t fix_type; + if (latch_mode == BTR_SEARCH_LEAF) { + if (!rw_lock_s_lock_nowait(&block->lock, + __FILE__, __LINE__)) { +got_no_latch: + buf_block_buf_fix_dec(block); + goto fail; + } + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + if (!rw_lock_x_lock_func_nowait_inline( + &block->lock, __FILE__, __LINE__)) { + goto got_no_latch; + } + fix_type = MTR_MEMO_PAGE_X_FIX; + } + mtr->memo_push(block, fix_type); + + buf_pool.stat.n_page_gets++; + + rw_lock_s_unlock(&part->latch); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + if (UNIV_UNLIKELY(fail)) { + goto fail_and_release_page; + } + } else if (UNIV_UNLIKELY(index != block->index + && index_id == block->index->id)) { + ut_a(block->index->freed()); + goto fail_and_release_page; + } + + if (block->page.state() != BUF_BLOCK_FILE_PAGE) { + + ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH); + +fail_and_release_page: + if (!ahi_latch) { + btr_leaf_page_release(block, latch_mode, mtr); + } + + btr_search_failure(info, cursor); + return false; + } + + ut_ad(page_rec_is_user_rec(rec)); + + btr_cur_position(index, (rec_t*) rec, block, cursor); + + /* Check the validity of the guess within the page */ + + /* If we only have the latch on search system, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. */ + if (index_id != btr_page_get_index_id(block->frame) + || !btr_search_check_guess(cursor, !!ahi_latch, tuple, mode)) { + goto fail_and_release_page; + } + + if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) { + + info->n_hash_potential++; + } + +#ifdef notdefined + /* These lines of code can be used in a debug version to check + the correctness of the searched cursor position: */ + + info->last_hash_succ = FALSE; + + /* Currently, does not work if the following fails: */ + ut_ad(!ahi_latch); + + btr_leaf_page_release(block, latch_mode, mtr); + + btr_cur_search_to_nth_level( + index, 0, tuple, mode, latch_mode, &cursor2, 0, mtr); + + if (mode == PAGE_CUR_GE + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { + + /* If mode is PAGE_CUR_GE, then the binary search + in the index tree may actually take us to the supremum + of the previous page */ + + info->last_hash_succ = FALSE; + + btr_pcur_open_on_user_rec( + index, tuple, mode, latch_mode, &pcur, mtr); + + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + } else { + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + } + + /* NOTE that it is theoretically possible that the above assertions + fail if the page of the cursor gets removed from the buffer pool + meanwhile! Thus it might not be a bug. */ +#endif + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + /* Increment the page get statistics though we did not really + fix the page: for user info only */ + ++buf_pool.stat.n_page_gets; + + if (!ahi_latch) { + buf_page_make_young_if_needed(&block->page); + } + + return true; +} + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH */ +void btr_search_drop_page_hash_index(buf_block_t* block) +{ + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap; + rec_offs* offsets; + +retry: + /* This debug check uses a dirty read that could theoretically cause + false positives while buf_pool.clear_hash_index() is executing. */ + assert_block_ahi_valid(block); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); + + if (!block->index) { + return; + } + + ut_ad(!block->page.buf_fix_count() + || block->page.state() == BUF_BLOCK_REMOVE_HASH + || rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S + | RW_LOCK_FLAG_SX)); + ut_ad(page_is_leaf(block->frame)); + + /* We must not dereference block->index here, because it could be freed + if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys.mutex)). + Determine the ahi_slot based on the block contents. */ + + const index_id_t index_id + = btr_page_get_index_id(block->frame); + + auto part = btr_search_sys.get_part(index_id, + block->page.id().space()); + + dict_index_t* index = block->index; + bool is_freed = index && index->freed(); + + if (is_freed) { + rw_lock_x_lock(&part->latch); + } else { + rw_lock_s_lock(&part->latch); + } + + assert_block_ahi_valid(block); + + + if (!index || !btr_search_enabled) { + if (is_freed) { + rw_lock_x_unlock(&part->latch); + } else { + rw_lock_s_unlock(&part->latch); + } + return; + } + +#ifdef MYSQL_INDEX_DISABLE_AHI + ut_ad(!index->disable_ahi); +#endif + ut_ad(btr_search_enabled); + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index_id == index->id); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + /* NOTE: The AHI fields of block must not be accessed after + releasing search latch, as the index page might only be s-latched! */ + + if (!is_freed) { + rw_lock_s_unlock(&part->latch); + } + + ut_a(n_fields > 0 || n_bytes > 0); + + page = block->frame; + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + if (rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_low(rec, page_is_comp(page)); + } + + prev_fold = 0; + + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (!is_freed) { + rw_lock_x_lock(&part->latch); + + if (UNIV_UNLIKELY(!block->index)) { + /* Someone else has meanwhile dropped the + hash index */ + goto cleanup; + } + + ut_a(block->index == index); + } + + if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + rw_lock_x_unlock(&part->latch); + + ut_free(folds); + goto retry; + } + + for (i = 0; i < n_cached; i++) { + ha_remove_all_nodes_to_page(&part->table, part->heap, + folds[i], page); + } + + switch (index->search_info->ref_count--) { + case 0: + ut_error; + case 1: + if (index->freed()) { + btr_search_lazy_free(index); + } + } + + block->index = NULL; + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached); + +cleanup: + assert_block_ahi_valid(block); + rw_lock_x_unlock(&part->latch); + + ut_free(folds); +} + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id) +{ + buf_block_t* block; + mtr_t mtr; + dberr_t err = DB_SUCCESS; + + mtr_start(&mtr); + + /* If the caller has a latch on the page, then the caller must + have a x-latch on the page and it must have already dropped + the hash index for the page. Because of the x-latch that we + are possibly holding, we cannot s-latch the page, but must + (recursively) x-latch it, even though we are only reading. */ + + block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL, + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr, &err); + + if (block) { + + /* If AHI is still valid, page can't be in free state. + AHI is dropped when page is freed. */ + DBUG_ASSERT(block->page.status != buf_page_t::FREED); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + + dict_index_t* index = block->index; + if (index != NULL) { + /* In all our callers, the table handle should + be open, or we should be in the process of + dropping the table (preventing eviction). */ + ut_ad(index->table->get_ref_count() > 0 + || mutex_own(&dict_sys.mutex)); + btr_search_drop_page_hash_index(block); + } + } + + mtr_commit(&mtr); +} + +/** Build a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible, and does not build a hash index if not. +@param[in,out] index index for which to build. +@param[in,out] block index page, s-/x- latched. +@param[in,out] ahi_latch the adaptive search latch +@param[in] n_fields hash this many full fields +@param[in] n_bytes hash this many bytes of the next field +@param[in] left_side hash for searches from left side */ +static +void +btr_search_build_page_hash_index( + dict_index_t* index, + buf_block_t* block, + rw_lock_t* ahi_latch, + uint16_t n_fields, + uint16_t n_bytes, + bool left_side) +{ + const rec_t* rec; + const rec_t* next_rec; + ulint fold; + ulint next_fold; + ulint n_cached; + ulint n_recs; + ulint* folds; + const rec_t** recs; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + +#ifdef MYSQL_INDEX_DISABLE_AHI + if (index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + rec_offs_init(offsets_); + ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch); + ut_ad(index); + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(page_is_leaf(block->frame)); + + ut_ad(rw_lock_own_flagged(&block->lock, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(block->page.id().page_no() >= 3); + + rw_lock_s_lock(ahi_latch); + + const bool enabled = btr_search_enabled; + const bool rebuild = enabled && block->index + && (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side); + + rw_lock_s_unlock(ahi_latch); + + if (!enabled) { + return; + } + + if (rebuild) { + btr_search_drop_page_hash_index(block); + } + + /* Check that the values for hash index build are sensible */ + + if (n_fields == 0 && n_bytes == 0) { + + return; + } + + if (dict_index_get_n_unique_in_tree(index) + < btr_search_get_n_fields(n_fields, n_bytes)) { + return; + } + + page_t* page = buf_block_get_frame(block); + n_recs = page_get_n_recs(page); + + if (n_recs == 0) { + + return; + } + + rec = page_rec_get_next_const(page_get_infimum_rec(page)); + + if (rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_const(rec); + if (!--n_recs) return; + } + + /* Calculate and cache fold values and corresponding records into + an array for fast insertion to the hash index */ + + folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds)); + recs = static_cast<const rec_t**>( + ut_malloc_nokey(n_recs * sizeof *recs)); + + n_cached = 0; + + ut_a(index->id == btr_page_get_index_id(page)); + + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + ut_ad(page_rec_is_supremum(rec) + || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0)); + + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + + if (left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + for (;;) { + next_rec = page_rec_get_next_const(rec); + + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + break; + } + + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + + if (fold != next_fold) { + /* Insert an entry into the hash index */ + + if (left_side) { + + folds[n_cached] = next_fold; + recs[n_cached] = next_rec; + n_cached++; + } else { + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + } + + rec = next_rec; + fold = next_fold; + } + + btr_search_check_free_space_in_heap(index); + + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled) { + goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. */ + if (!block->index) { + assert_block_ahi_empty(block); + index->search_info->ref_count++; + } else if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side) { + goto exit_func; + } + + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS; + block->curr_n_bytes = n_bytes & ((1U << 15) - 1); + block->curr_left_side = left_side; + block->index = index; + + { + auto part = btr_search_sys.get_part(*index); + for (ulint i = 0; i < n_cached; i++) { + ha_insert_for_fold(&part->table, part->heap, + folds[i], block, recs[i]); + } + } + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); +exit_func: + assert_block_ahi_valid(block); + rw_lock_x_unlock(ahi_latch); + + ut_free(folds); + ut_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ +void +btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor) +{ + rw_lock_t* ahi_latch = &btr_search_sys.get_part(*cursor->index) + ->latch; + ut_ad(!rw_lock_own_flagged(ahi_latch, + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + + buf_block_t* block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + bool build_index = btr_search_update_block_hash_info(info, block); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + btr_search_update_hash_ref(info, block, cursor); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. */ + btr_search_build_page_hash_index(cursor->index, block, + ahi_latch, + block->n_fields, + block->n_bytes, + block->left_side); + } +} + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block) +{ + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_X)); + + if (!btr_search_enabled) { + return; + } + + dict_index_t* index = block->index; + if (!index) { + index = new_block->index; + } else { + ut_ad(!new_block->index || index == new_block->index); + } + assert_block_ahi_valid(block); + assert_block_ahi_valid(new_block); + + rw_lock_t* ahi_latch = index + ? &btr_search_sys.get_part(*index)->latch + : nullptr; + + if (new_block->index) { +drop_exit: + btr_search_drop_page_hash_index(block); + return; + } + + if (!index) { + return; + } + + if (index->freed()) { + goto drop_exit; + } + + rw_lock_s_lock(ahi_latch); + + if (block->index) { + uint16_t n_fields = block->curr_n_fields; + uint16_t n_bytes = block->curr_n_bytes; + bool left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + rw_lock_s_unlock(ahi_latch); + + ut_a(n_fields > 0 || n_bytes > 0); + + btr_search_build_page_hash_index( + index, new_block, ahi_latch, + n_fields, n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + rw_lock_s_unlock(ahi_latch); +} + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t* cursor) +{ + buf_block_t* block; + const rec_t* rec; + ulint fold; + dict_index_t* index; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + + assert_block_ahi_valid(block); + index = block->index; + + if (!index) { + + return; + } + + if (index != cursor->index) { + btr_search_drop_page_hash_index(block); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index == cursor->index); + ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); + ut_ad(!dict_index_is_ibuf(index)); + + rec = btr_cur_get_rec(cursor); + + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + auto part = btr_search_sys.get_part(*index); + + rw_lock_x_lock(&part->latch); + assert_block_ahi_valid(block); + + if (block->index && btr_search_enabled) { + ut_a(block->index == index); + + if (ha_search_and_delete_if_found(&part->table, part->heap, + fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } + + assert_block_ahi_valid(block); + } + + rw_lock_x_unlock(&part->latch); +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + rec_t* rec; + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + + index = block->index; + + if (!index) { + + return; + } + + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_a(cursor->index == index); + ut_ad(!dict_index_is_ibuf(index)); + rw_lock_x_lock(ahi_latch); + + if (!block->index || !btr_search_enabled) { + + goto func_exit; + } + + ut_a(block->index == index); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + + if (ha_search_and_update_if_found( + &btr_search_sys.get_part(*cursor->index)->table, + cursor->fold, rec, block, + page_rec_get_next(rec))) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED); + } + +func_exit: + assert_block_ahi_valid(block); + rw_lock_x_unlock(ahi_latch); + } else { + rw_lock_x_unlock(ahi_latch); + + btr_search_update_hash_on_insert(cursor, ahi_latch); + } +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void +btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + const rec_t* rec; + const rec_t* ins_rec; + const rec_t* next_rec; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) */ + ulint n_fields; + ulint n_bytes; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch); + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + ut_ad(!btr_search_own_any(RW_LOCK_S)); + ut_ad(!btr_search_own_any(RW_LOCK_X)); +#ifdef MYSQL_INDEX_DISABLE_AHI + if (cursor->index->disable_ahi) return; +#endif + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X)); + assert_block_ahi_valid(block); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + btr_search_check_free_space_in_heap(index); + + rec = btr_cur_get_rec(cursor); + +#ifdef MYSQL_INDEX_DISABLE_AHI + ut_a(!index->disable_ahi); +#endif + if (index != cursor->index) { + ut_ad(index->id == cursor->index->id); + btr_search_drop_page_hash_index(block); + return; + } + + ut_a(index == cursor->index); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + const bool left_side = block->curr_left_side; + + ins_rec = page_rec_get_next_const(rec); + next_rec = page_rec_get_next_const(ins_rec); + + offsets = rec_get_offsets(ins_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + } + + /* We must not look up "part" before acquiring ahi_latch. */ + btr_search_sys_t::partition* part= nullptr; + bool locked = false; + + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + } else { + if (left_side) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + fold, block, rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + if (!locked) { + locked = true; + rw_lock_x_lock(ahi_latch); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + next_fold, block, next_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (locked) { + rw_lock_x_unlock(ahi_latch); + } + ut_ad(!rw_lock_own(ahi_latch, RW_LOCK_X)); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +__attribute__((nonnull)) +/** @return whether a range of the cells is valid */ +static bool ha_validate(const hash_table_t *table, + ulint start_index, ulint end_index) +{ + ut_a(start_index <= end_index); + ut_a(end_index < table->n_cells); + + bool ok= true; + + for (ulint i= start_index; i <= end_index; i++) + { + for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node; + node= node->next) + { + if (table->calc_hash(node->fold) != i) { + ib::error() << "Hash table node fold value " << node->fold + << " does not match the cell number " << i; + ok= false; + } + } + } + + return ok; +} + +/** Validates the search system for given hash table. +@param[in] hash_table_id hash table to validate +@return TRUE if ok */ +static +ibool +btr_search_hash_table_validate(ulint hash_table_id) +{ + ha_node_t* node; + ibool ok = TRUE; + ulint i; + ulint cell_count; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + + btr_search_x_lock_all(); + if (!btr_search_enabled) { + btr_search_x_unlock_all(); + return(TRUE); + } + + /* How many cells to check before temporarily releasing + search latches. */ + ulint chunk_size = 10000; + + rec_offs_init(offsets_); + + mysql_mutex_lock(&buf_pool.mutex); + + auto &part = btr_search_sys.parts[hash_table_id]; + + cell_count = part.table.n_cells; + + for (i = 0; i < cell_count; i++) { + /* We release search latches every once in a while to + give other queries a chance to run. */ + if ((i != 0) && ((i % chunk_size) == 0)) { + + mysql_mutex_unlock(&buf_pool.mutex); + btr_search_x_unlock_all(); + + os_thread_yield(); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + ok = true; + goto func_exit; + } + + mysql_mutex_lock(&buf_pool.mutex); + + ulint curr_cell_count = part.table.n_cells; + + if (cell_count != curr_cell_count) { + + cell_count = curr_cell_count; + + if (i >= cell_count) { + break; + } + } + } + + node = static_cast<ha_node_t*>(part.table.array[i].node); + + for (; node != NULL; node = node->next) { + const buf_block_t* block + = buf_pool.block_from_ahi((byte*) node->data); + index_id_t page_index_id; + + if (UNIV_LIKELY(block->page.state() + == BUF_BLOCK_FILE_PAGE)) { + + /* The space and offset are only valid + for file blocks. It is possible that + the block is being freed + (BUF_BLOCK_REMOVE_HASH, see the + assertion and the comment below) */ + const page_id_t id(block->page.id()); + if (const buf_page_t* hash_page + = buf_pool.page_hash_get_low( + id, id.fold())) { + ut_ad(hash_page == &block->page); + goto state_ok; + } + } + + /* When a block is being freed, + buf_LRU_search_and_free_block() first removes + the block from buf_pool.page_hash by calling + buf_LRU_block_remove_hashed_page(). Then it + invokes btr_search_drop_page_hash_index(). */ + ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH); +state_ok: + ut_ad(!dict_index_is_ibuf(block->index)); + ut_ad(block->page.id().space() + == block->index->table->space_id); + + page_index_id = btr_page_get_index_id(block->frame); + + offsets = rec_get_offsets( + node->data, block->index, offsets, + block->index->n_core_fields, + btr_search_get_n_fields(block->curr_n_fields, + block->curr_n_bytes), + &heap); + + const ulint fold = rec_fold( + node->data, offsets, + block->curr_n_fields, + block->curr_n_bytes, + page_index_id); + + if (node->fold != fold) { + const page_t* page = block->frame; + + ok = FALSE; + + ib::error() << "Error in an adaptive hash" + << " index pointer to page " + << block->page.id() + << ", ptr mem address " + << reinterpret_cast<const void*>( + node->data) + << ", index id " << page_index_id + << ", node fold " << node->fold + << ", rec fold " << fold; + + fputs("InnoDB: Record ", stderr); + rec_print_new(stderr, node->data, offsets); + fprintf(stderr, "\nInnoDB: on that page." + " Page mem address %p, is hashed %p," + " n fields %lu\n" + "InnoDB: side %lu\n", + (void*) page, (void*) block->index, + (ulong) block->curr_n_fields, + (ulong) block->curr_left_side); + ut_ad(0); + } + } + } + + for (i = 0; i < cell_count; i += chunk_size) { + /* We release search latches every once in a while to + give other queries a chance to run. */ + if (i != 0) { + mysql_mutex_unlock(&buf_pool.mutex); + btr_search_x_unlock_all(); + + os_thread_yield(); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + ok = true; + goto func_exit; + } + + mysql_mutex_lock(&buf_pool.mutex); + + ulint curr_cell_count = part.table.n_cells; + + if (cell_count != curr_cell_count) { + + cell_count = curr_cell_count; + + if (i >= cell_count) { + break; + } + } + } + + ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1); + + if (!ha_validate(&part.table, i, end_index)) { + ok = FALSE; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); +func_exit: + btr_search_x_unlock_all(); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ok); +} + +/** Validate the search system. +@return true if ok. */ +bool +btr_search_validate() +{ + for (ulint i = 0; i < btr_ahi_parts; ++i) { + if (!btr_search_hash_table_validate(i)) { + return(false); + } + } + + return(true); +} + +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ |