summaryrefslogtreecommitdiffstats
path: root/storage/innobase/btr
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/btr')
-rw-r--r--storage/innobase/btr/btr0btr.cc5433
-rw-r--r--storage/innobase/btr/btr0bulk.cc1233
-rw-r--r--storage/innobase/btr/btr0cur.cc7017
-rw-r--r--storage/innobase/btr/btr0defragment.cc820
-rw-r--r--storage/innobase/btr/btr0pcur.cc667
-rw-r--r--storage/innobase/btr/btr0sea.cc2328
6 files changed, 17498 insertions, 0 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
new file mode 100644
index 00000000..08be1991
--- /dev/null
+++ b/storage/innobase/btr/btr0btr.cc
@@ -0,0 +1,5433 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.cc
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "btr0defragment.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "gis0geo.h"
+#include "dict0boot.h"
+#include "row0sel.h" /* row_search_max_autoinc() */
+#include "log.h"
+
+/**************************************************************//**
+Checks if the page in the cursor can be merged with given page.
+If necessary, re-organize the merge_page.
+This is only a forward declaration of a file-local (static) helper;
+its definition is not part of this section of the file.
+@return true if possible to merge. */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ uint32_t page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+
+Node pointer page latches acquisition is protected by index->lock latch.
+
+Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
+either in S (shared) or X (exclusive) mode and block->lock was not acquired on
+node pointer pages.
+
+After MariaDB 10.2.2, block->lock S-latch or X-latch is used to protect
+node pointer pages, and acquisition of node pointer page latches is protected by
+index->lock.
+
+(0) Definition: B-tree level.
+
+(0.1) The leaf pages of the B-tree are at level 0.
+
+(0.2) The parent of a page at level L has level L+1. (The level of the
+root page is equal to the tree height.)
+
+(0.3) The B-tree lock (index->lock) is the parent of the root page and
+has a level = tree height + 1.
+
+Index->lock has 3 possible locking modes:
+
+(1) S-latch:
+
+(1.1) All latches for pages must be obtained in descending order of tree level.
+
+(1.2) Before obtaining the first node pointer page latch at a given B-tree
+level, parent latch must be held (at level +1 ).
+
+(1.3) If a node pointer page is already latched at the same level,
+we can only obtain a latch on its right sibling page at the same level.
+
+(1.4) Release of the node pointer page latches must be done in
+child-to-parent order. (Prevents deadlocks when obtained index->lock
+in SX mode).
+
+(1.4.1) Level L node pointer page latch can be released only when
+no latches at children level i.e. level < L are held.
+
+(1.4.2) All latches from node pointer pages must be released so
+that no latches are obtained between.
+
+(1.5) [implied by (1.1), (1.2)] Root page latch must be first node pointer
+latch obtained.
+
+(2) SX-latch:
+
+In this case rules (1.2) and (1.3) from S-latch case are relaxed and
+merged into (2.2) and rule (1.4) is removed. Thus, latch acquisition
+can be skipped at some tree levels and latches can be obtained in
+a less restricted order.
+
+(2.1) [identical to (1.1)]: All latches for pages must be obtained in descending
+order of tree level.
+
+(2.2) When a node pointer latch at level L is obtained,
+the left sibling page latch in the same level or some ancestor
+page latch (at level > L) must be held.
+
+(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained can
+be any node pointer page.
+
+(3) X-latch:
+
+Node pointer latches can be obtained in any order.
+
+NOTE: New rules after MariaDB 10.2.2 do not affect the latching rules of leaf pages:
+
+index->lock S-latch is needed in read for the node pointer traversal. When the leaf
+level is reached, index->lock can be released (and with the MariaDB 10.2.2 changes, all
+node pointer latches). Left to right index traversal in leaf page level can be safely done
+by obtaining right sibling leaf page latch and then releasing the old page latch.
+
+Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock
+S-latch.
+
+B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
+allocations are protected by index->lock X-latch.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
+
+/** Check a file segment header within a B-tree root page.
+A header is accepted when its 16-bit page offset field lies within
+[FIL_PAGE_DATA, srv_page_size - FIL_PAGE_DATA_END] and its 32-bit
+tablespace identifier field equals space.id. Otherwise an error that
+names the root page number, the first file of the tablespace chain and
+the offending header offset is reported via sql_print_error().
+@param offset  file segment header offset within the page frame
+@param block   B-tree root page
+@param space   tablespace
+@return whether the segment header is valid */
+static bool btr_root_fseg_validate(ulint offset,
+                                   const buf_block_t &block,
+                                   const fil_space_t &space)
+{
+  ut_ad(block.page.id().space() == space.id);
+  const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
+                                       block.page.frame);
+  if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END &&
+      mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) == space.id)
+    return true;
+  /* The format string contains three conversions (page number, file
+  name, header offset), so all three arguments must be passed;
+  omitting the last one is undefined behaviour in a variadic call. */
+  sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted "
+                  "at " ULINTPF,
+                  block.page.id().page_no(),
+                  UT_LIST_GET_FIRST(space.chain)->name, offset);
+  return false;
+}
+
+/** Report that a page of an index could not be decrypted.
+Pushes a DB_DECRYPTION_FAILED warning that names the table, and then
+marks the table as unreadable (file_unreadable is set).
+@param index   index whose table is affected */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
+{
+  dict_table_t *const table= index.table;
+
+  ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED,
+                  "Table %s is encrypted but encryption service or"
+                  " used key_id is not available. "
+                  " Can't continue reading table.",
+                  table->name.m_name);
+
+  table->file_unreadable= true;
+}
+
+/** Fetch and latch an index page, and verify that it belongs to the index.
+@param[in]      index   index tree
+@param[in]      page    page number
+@param[in]      mode    latch mode; must not be RW_NO_LATCH
+@param[in]      merge   whether change buffer merge should be attempted
+                        (it is never requested for a clustered index)
+@param[in,out]  mtr     mini-transaction
+@param[out]     err     error code; may be nullptr when the caller is
+                        not interested in it
+@return block
+@retval nullptr if the page could not be acquired, or if its row format,
+index id or page type does not match the index (DB_PAGE_CORRUPTED) */
+buf_block_t *btr_block_get(const dict_index_t &index,
+                           uint32_t page, rw_lock_type_t mode, bool merge,
+                           mtr_t *mtr, dberr_t *err)
+{
+  ut_ad(mode != RW_NO_LATCH);
+
+  /* Give callers that passed err=nullptr a place to store the status,
+  so that the code below can always dereference it. */
+  dberr_t unused_status;
+  dberr_t *const status= err ? err : &unused_status;
+
+  const bool try_merge= merge && !index.is_clust();
+  buf_block_t *const block=
+    buf_page_get_gen(page_id_t{index.table->space->id, page},
+                     index.table->space->zip_size(), mode, nullptr, BUF_GET,
+                     mtr, status, try_merge);
+  ut_ad(!block == (*status != DB_SUCCESS));
+
+  if (UNIV_UNLIKELY(block == nullptr))
+  {
+    if (*status == DB_DECRYPTION_FAILED)
+      btr_decryption_failed(index);
+    return nullptr;
+  }
+
+  /* The page must agree with the index on row format, index id,
+  being an index page at all, and being (or not being) an R-tree page. */
+  const auto frame= block->page.frame;
+  const bool belongs_to_index=
+    !!page_is_comp(frame) == index.table->not_redundant() &&
+    btr_page_get_index_id(frame) == index.id &&
+    fil_page_index_page_check(frame) &&
+    index.is_spatial() == (fil_page_get_type(frame) == FIL_PAGE_RTREE);
+
+  if (belongs_to_index)
+    return block;
+
+  *status= DB_PAGE_CORRUPTED;
+  return nullptr;
+}
+
+/**************************************************************//**
+Gets the root node of a tree and latches it in the requested mode.
+Unless mode is RW_NO_LATCH, the page is checked against the index
+(row format, index id, index page type, R-tree page type) and, unless
+index->is_ibuf() holds, both file segment headers of the root page
+are validated.
+@return root page, latched according to mode
+@retval nullptr on failure; *err indicates the reason */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: latch mode; callers in this
+ file also pass RW_SX_LATCH and
+ RW_NO_LATCH */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code; dereferenced
+ unconditionally, must not be NULL */
+{
+ /* Without a table or a tablespace there is no page to fetch. */
+ if (!index->table || !index->table->space)
+ {
+ *err= DB_TABLESPACE_NOT_FOUND;
+ return nullptr;
+ }
+
+ /* With BTR_CUR_ADAPT, "guess" is a reference to the cached
+ root_guess of the index: it is passed to buf_page_get_gen() as a hint
+ and is then overwritten with the block that was returned (the
+ "guess=" below is the start of a chained assignment "guess= block=
+ buf_page_get_gen(...)"). Without BTR_CUR_ADAPT, no hint is passed. */
+ buf_block_t *block;
+#ifndef BTR_CUR_ADAPT
+ static constexpr buf_block_t *guess= nullptr;
+#else
+ buf_block_t *&guess= btr_search_get_info(index)->root_guess;
+ guess=
+#endif
+ block=
+ buf_page_get_gen(page_id_t{index->table->space->id, index->page},
+ index->table->space->zip_size(), mode, guess, BUF_GET,
+ mtr, err, false);
+ ut_ad(!block == (*err != DB_SUCCESS));
+
+ if (UNIV_LIKELY(block != nullptr))
+ {
+ /* A page that was requested without a latch is returned
+ without any validation of its contents. */
+ if (UNIV_UNLIKELY(mode == RW_NO_LATCH));
+ else if (!!page_is_comp(block->page.frame) !=
+ index->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index->id ||
+ !fil_page_index_page_check(block->page.frame) ||
+ index->is_spatial() !=
+ (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
+ {
+ *err= DB_PAGE_CORRUPTED;
+ block= nullptr;
+ }
+ /* The file segment headers are not validated for an index
+ for which is_ibuf() holds. */
+ else if (index->is_ibuf());
+ else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *block, *index->table->space) ||
+ !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *index->table->space))
+ {
+ *err= DB_CORRUPTION;
+ block= nullptr;
+ }
+ }
+ else if (*err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+
+ return block;
+}
+
+/**************************************************************//**
+Gets the page frame of the root node of a tree, sx-latching the root
+page for file segment access.
+@return root page frame, sx-latched
+@retval nullptr if btr_root_block_get() failed */
+static
+page_t*
+btr_root_get(
+/*=========*/
+  dict_index_t*  index,  /*!< in: index tree */
+  mtr_t*         mtr,    /*!< in: mtr */
+  dberr_t*       err)    /*!< out: error code */
+{
+  /* The SX latch is intended for accessing the file segment lists;
+  concurrent reads of other data on the page remain allowed. */
+  buf_block_t *const root= btr_root_block_get(index, RW_SX_LATCH, mtr, err);
+  return root ? root->page.frame : nullptr;
+}
+
+/**************************************************************//**
+Validates the page offset stored in a file segment header of a B-tree
+root page and overwrites the tablespace identifier stored in the header.
+For a compressed page, the 4 bytes written are also copied to the same
+page offset in page_zip->data.
+@return true if the header offset was in range and the header was
+updated; false if the header was rejected and left unmodified */
+static
+bool
+btr_root_fseg_adjust_on_import(
+/*===========================*/
+  fseg_header_t*   seg_header,  /*!< in/out: segment header */
+  page_zip_des_t*  page_zip,    /*!< in/out: compressed page,
+                                or NULL */
+  ulint            space)       /*!< in: tablespace identifier */
+{
+  const ulint hdr_offset= mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+  const bool in_range= hdr_offset >= FIL_PAGE_DATA
+    && hdr_offset <= srv_page_size - FIL_PAGE_DATA_END;
+
+  if (!in_range)
+    return false;
+
+  fseg_header_t *const space_field= seg_header + FSEG_HDR_SPACE;
+  mach_write_to_4(space_field, space);
+
+  if (UNIV_LIKELY_NULL(page_zip))
+    memcpy(page_zip->data + page_offset(space_field), space_field, 4);
+
+  return true;
+}
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+The root page is X-latched inside a local mini-transaction that runs
+with MTR_LOG_NO_REDO. The page must be an index page without siblings.
+For a clustered index, the page row format must agree with the table
+and the table flags must be compatible with the tablespace flags.
+Finally the tablespace identifier in both file segment headers
+(PAGE_BTR_SEG_LEAF and PAGE_BTR_SEG_TOP) is overwritten with
+table->space_id.
+@return error code, or DB_SUCCESS */
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ dberr_t err;
+ mtr_t mtr;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_table_t* table = index->table;
+
+ /* Debug hook for injecting a failure before anything is done. */
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
+ return(DB_CORRUPTION););
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ buf_block_t* block = buf_page_get_gen(
+ page_id_t(table->space->id, index->page),
+ table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET,
+ &mtr, &err);
+ if (!block) {
+ ut_ad(err != DB_SUCCESS);
+ goto func_exit;
+ }
+
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ /* A root page must be an index page and must not have siblings. */
+ if (!fil_page_index_page_check(page) || page_has_siblings(page)) {
+ err = DB_CORRUPTION;
+
+ } else if (dict_index_is_clust(index)) {
+ bool page_is_compact_format;
+
+ page_is_compact_format = page_is_comp(page) > 0;
+
+ /* Check if the page format and table format agree. */
+ if (page_is_compact_format != dict_table_is_comp(table)) {
+ err = DB_CORRUPTION;
+ } else {
+ /* Check that the table flags and the tablespace
+ flags match. The FSP_FLAGS_MEM_MASK bits are excluded
+ from the comparison, and a match in either argument
+ order of is_flags_equal() is accepted. */
+ uint32_t tf = dict_tf_to_fsp_flags(table->flags);
+ uint32_t sf = table->space->flags;
+ sf &= ~FSP_FLAGS_MEM_MASK;
+ tf &= ~FSP_FLAGS_MEM_MASK;
+ if (fil_space_t::is_flags_equal(tf, sf)
+ || fil_space_t::is_flags_equal(sf, tf)) {
+ /* NOTE(review): tf was masked with
+ ~FSP_FLAGS_MEM_MASK above, so
+ (tf & FSP_FLAGS_MEM_MASK) is 0 here and this
+ assignment only clears the FSP_FLAGS_MEM_MASK
+ bits of space->flags; confirm that this is
+ the intended outcome. */
+ mysql_mutex_lock(&fil_system.mutex);
+ table->space->flags = (table->space->flags
+ & ~FSP_FLAGS_MEM_MASK)
+ | (tf & FSP_FLAGS_MEM_MASK);
+ mysql_mutex_unlock(&fil_system.mutex);
+ err = DB_SUCCESS;
+ } else {
+ err = DB_CORRUPTION;
+ }
+ }
+ } else {
+ /* No format or flag checks are made for an index that is
+ not clustered. */
+ err = DB_SUCCESS;
+ }
+
+ /* Check and adjust the file segment headers, if all OK so far. */
+ if (err == DB_SUCCESS
+ && (!btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + page, page_zip, table->space_id)
+ || !btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + page, page_zip, table->space_id))) {
+
+ err = DB_CORRUPTION;
+ }
+
+func_exit:
+ /* The mini-transaction is committed on both the success and the
+ failure path. */
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty().
+The block must be X-latched by the mini-transaction. For a compressed
+page, the index id is written directly to the page frame before
+page_create_zip() is invoked. Otherwise the page is created with
+page_create(), the R-tree page type and split sequence number are
+set up for a spatial index, and the level and index id are written
+through the mini-transaction. */
+void
+btr_page_create(
+/*============*/
+  buf_block_t*     block,    /*!< in/out: page to be created */
+  page_zip_des_t*  page_zip, /*!< in/out: compressed page, or NULL */
+  dict_index_t*    index,    /*!< in: index */
+  ulint            level,    /*!< in: the B-tree level of the page */
+  mtr_t*           mtr)      /*!< in: mtr */
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+  byte *const index_id_field=
+    my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame);
+
+  if (UNIV_LIKELY_NULL(page_zip))
+  {
+    /* Compressed page: store the index id, then build the page. */
+    mach_write_to_8(index_id_field, index->id);
+    page_create_zip(block, index, level, 0, mtr);
+    return;
+  }
+
+  page_create(block, mtr, dict_table_is_comp(index->table));
+
+  if (index->is_spatial())
+  {
+    /* Only the low-order byte of the page type differs between
+    FIL_PAGE_INDEX and FIL_PAGE_RTREE, so a 1-byte write suffices. */
+    static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                  FIL_PAGE_RTREE, "compatibility");
+    mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+                  byte(FIL_PAGE_RTREE));
+
+    /* Zero the split sequence number unless it already is zero. */
+    if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM) != 0)
+      mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+  }
+
+  /* Set the level and the index id of the new index page. */
+  mtr->write<2,mtr_t::MAYBE_NOP>(
+    *block,
+    my_assume_aligned<2>(PAGE_HEADER + PAGE_LEVEL + block->page.frame),
+    level);
+  mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id_field, index->id);
+}
+
+/** Find a page that is already registered in the memo of this
+mini-transaction with a suitable latch.
+A memo slot qualifies if its type, ignoring the MTR_MEMO_MODIFY flag,
+is either MTR_MEMO_PAGE_X_FIX or the requested type.
+@param id    page identifier
+@param type  MTR_MEMO_PAGE_X_FIX, MTR_MEMO_PAGE_SX_FIX or
+             MTR_MEMO_PAGE_S_FIX
+@return the block of the first qualifying slot whose page id is id
+@retval nullptr if there is no such slot */
+buf_block_t *
+mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const
+{
+  ut_ad(is_active());
+  ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX ||
+        type == MTR_MEMO_PAGE_S_FIX);
+
+  const ulint n_slots= m_memo.size();
+
+  for (ulint pos= 0; pos != n_slots; ++pos)
+  {
+    const mtr_memo_slot_t &entry= m_memo[pos];
+    const auto latch_type= mtr_memo_type_t(entry.type & ~MTR_MEMO_MODIFY);
+
+    if (latch_type != MTR_MEMO_PAGE_X_FIX && latch_type != type)
+      continue;
+
+    buf_block_t *candidate= static_cast<buf_block_t*>(entry.object);
+    if (candidate->page.id() == id)
+      return candidate;
+  }
+
+  return nullptr;
+}
+
+/** Look up the root page of an index among the pages that the
+mini-transaction has already latched.
+@param index  index tree
+@param mtr    mini-transaction
+@return root page block
+@retval nullptr if mtr_t::get_already_latched() did not find the page */
+static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
+{
+  const page_id_t root_id{index.table->space_id, index.page};
+  return mtr->get_already_latched(root_id, MTR_MEMO_PAGE_SX_FIX);
+}
+
+/** Fetch an index page that should have been already latched in the
+mini-transaction. If the page is not found in the memo, it is acquired
+with an X latch via btr_block_get(); the mini-transaction is then
+expected to hold index.lock in X mode.
+@param mtr    mini-transaction
+@param index  index tree
+@param id     page identifier
+@param err    out: error code (DB_SUCCESS if the page was found in mtr)
+@return the block
+@retval nullptr if btr_block_get() failed */
+static buf_block_t *
+btr_block_reget(mtr_t *mtr, const dict_index_t &index,
+                const page_id_t id, dberr_t *err)
+{
+  buf_block_t *latched= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+
+  if (!latched)
+  {
+    ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
+    return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
+  }
+
+  *err= DB_SUCCESS;
+  return latched;
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
+The root page is expected to be already latched in mtr; the first page
+of the PAGE_BTR_IBUF_FREE_LIST is X-latched and removed from the list.
+@return new allocated block, x-latched
+@retval nullptr if the latched root was not found in mtr, or if the
+page could not be acquired */
+static
+buf_block_t*
+btr_page_alloc_for_ibuf(
+/*====================*/
+  dict_index_t*  index,  /*!< in: index tree */
+  mtr_t*         mtr,    /*!< in: mtr */
+  dberr_t*       err)    /*!< out: error code */
+{
+  buf_block_t *const root= btr_get_latched_root(*index, mtr);
+  if (UNIV_UNLIKELY(root == nullptr))
+    return nullptr;
+
+  /* The first node of the free list identifies the page to hand out. */
+  const auto first_free_page=
+    mach_read_from_4(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST + FLST_FIRST +
+                     FIL_ADDR_PAGE + root->page.frame);
+
+  buf_block_t *const new_block=
+    buf_page_get_gen(page_id_t(IBUF_SPACE_ID, first_free_page),
+                     0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
+
+  if (new_block != nullptr)
+    *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
+                      PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+
+  ut_d(if (*err == DB_SUCCESS)
+         flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+
+  return new_block;
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+The page is allocated from the PAGE_BTR_SEG_TOP segment of the root
+page when level is nonzero, and from PAGE_BTR_SEG_LEAF when level is 0.
+@return the block returned by fseg_alloc_free_page_general()
+@retval NULL if no page could be allocated */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+btr_page_alloc_low(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ uint32_t hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr, /*!< in/out: mtr or another
+ mini-transaction in which the
+ page should be initialized. */
+ dberr_t* err) /*!< out: error code */
+{
+ /* Look up the root block without latching it, only to find out
+ whether this mini-transaction already holds a U or X latch on it.
+ The savepoint is rolled back afterwards; presumably this discards
+ whatever the unlatched lookup registered in mtr (confirm against
+ mtr_t::rollback_to_savepoint()). */
+ const auto savepoint= mtr->get_savepoint();
+ buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err);
+ if (UNIV_UNLIKELY(!root))
+ return root;
+
+ const bool have_latch= mtr->have_u_or_x_latch(*root);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!have_latch || !root->index || !root->index->freed());
+#endif
+ mtr->rollback_to_savepoint(savepoint);
+
+ /* If the root was not latched yet, acquire it with an SX latch;
+ this second lookup also validates the root page. */
+ if (!have_latch &&
+ UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err))))
+ return root;
+
+ /* Non-leaf pages (level > 0) and leaf pages are allocated from
+ separate file segments whose headers reside in the root page. */
+ fseg_header_t *seg_header= root->page.frame +
+ (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF);
+ return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction,
+ true, mtr, init_mtr, err);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+For an ibuf tree the page is taken from the free list of the tree;
+hint_page_no, file_direction, level and init_mtr are then not used.
+@retval NULL if no page could be allocated */
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+  dict_index_t*  index,          /*!< in: index */
+  uint32_t       hint_page_no,   /*!< in: hint of a good page */
+  byte           file_direction, /*!< in: direction where a possible
+                                 page split is made */
+  ulint          level,          /*!< in: level where the page is placed
+                                 in the tree */
+  mtr_t*         mtr,            /*!< in/out: mini-transaction
+                                 for the allocation */
+  mtr_t*         init_mtr,       /*!< in/out: mini-transaction
+                                 for x-latching and initializing
+                                 the page */
+  dberr_t*       err)            /*!< out: error code */
+{
+  ut_ad(level < BTR_MAX_NODE_LEVEL);
+
+  if (index->is_ibuf())
+    return btr_page_alloc_for_ibuf(index, mtr, err);
+
+  return btr_page_alloc_low(index, hint_page_no, file_direction, level,
+                            mtr, init_mtr, err);
+}
+
+/**************************************************************//**
+Frees a page used in an ibuf tree. Puts the page to the free list of the
+ibuf tree: the block becomes the first node of PAGE_BTR_IBUF_FREE_LIST
+in the root page, which is looked up among the pages latched in mtr.
+@return error code from flst_add_first() */
+static
+dberr_t
+btr_page_free_for_ibuf(
+/*===================*/
+  dict_index_t*  index,  /*!< in: index tree */
+  buf_block_t*   block,  /*!< in: block to be freed, x-latched */
+  mtr_t*         mtr)    /*!< in: mtr */
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+  constexpr uint16_t free_list= PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST;
+  constexpr uint16_t list_node= PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE;
+
+  buf_block_t *ibuf_root= btr_get_latched_root(*index, mtr);
+  const dberr_t status=
+    flst_add_first(ibuf_root, free_list, block, list_node, mtr);
+
+  ut_d(if (status == DB_SUCCESS)
+         flst_validate(ibuf_root, free_list, mtr));
+
+  return status;
+}
+
+/** Free an index page.
+The modify clock of the block is incremented, the page is returned to
+the file segment it was allocated from (or, for an ibuf tree, to the
+free list of the tree), and on success buf_page_free() is invoked.
+The root page must not be freed with this function.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page
+@param[in] space_latched whether index->table->space->x_lock() was called
+@return error code */
+dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ bool blob, bool space_latched)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+#if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG
+ if (btr_search_check_marked_free_index(block))
+ {
+ ut_ad(!blob);
+ ut_ad(page_is_leaf(block->page.frame));
+ }
+#endif
+ const uint32_t page{block->page.id().page_no()};
+ ut_ad(index->table->space_id == block->page.id().space());
+ /* The root page is freed by btr_free_root(). */
+ ut_ad(page != index->page);
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* The page gets invalid for optimistic searches: increment the frame
+ modify clock */
+ buf_block_modify_clock_inc(block);
+
+ /* TODO: Discard any operations for block from mtr->m_log.
+ The page will be freed, so previous changes to it by this
+ mini-transaction should not matter. */
+
+ if (index->is_ibuf())
+ return btr_page_free_for_ibuf(index, block, mtr);
+
+ fil_space_t *space= index->table->space;
+ dberr_t err;
+
+ /* Look up the root block without latching it, only to find out
+ whether this mini-transaction already holds a U or X latch on it;
+ then roll back to the savepoint and, if needed, acquire the root
+ with an SX latch. If the first lookup fails, err has been set by
+ btr_root_block_get(). */
+ const auto savepoint= mtr->get_savepoint();
+ if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err))
+ {
+ const bool have_latch= mtr->have_u_or_x_latch(*root);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!have_latch || !root->index || !root->index->freed());
+#endif
+ mtr->rollback_to_savepoint(savepoint);
+ /* BLOB pages and leaf pages are freed to PAGE_BTR_SEG_LEAF,
+ all other pages to PAGE_BTR_SEG_TOP. */
+ if (have_latch ||
+ (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err)))
+ err= fseg_free_page(&root->page.frame[blob ||
+ page_is_leaf(block->page.frame)
+ ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ : PAGE_HEADER + PAGE_BTR_SEG_TOP],
+ space, page, mtr, space_latched);
+ }
+ if (err == DB_SUCCESS)
+ buf_page_free(space, page, mtr);
+
+ /* The page was marked free in the allocation bitmap, but it
+ should remain exclusively latched until mtr_t::commit() or until it
+ is explicitly freed from the mini-transaction. */
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ return err;
+}
+
+/** Set the child page number in a node pointer record.
+The child page number is the last field of the record and occupies the
+final REC_NODE_PTR_SIZE bytes of the record data. On a compressed page
+the update is delegated to page_zip_write_node_ptr(); otherwise the
+4 bytes are written through the mini-transaction.
+@param[in,out]  block    non-leaf index page
+@param[in,out]  rec      node pointer record in the page
+@param[in]      offsets  rec_get_offsets(rec)
+@param[in]      page_no  child page number
+@param[in,out]  mtr      mini-transaction */
+inline void btr_node_ptr_set_child_page_no(buf_block_t *block,
+                                           rec_t *rec, const rec_offs *offsets,
+                                           ulint page_no, mtr_t *mtr)
+{
+  ut_ad(rec_offs_validate(rec, NULL, offsets));
+  ut_ad(!page_rec_is_leaf(rec));
+  ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+  ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) ==
+        REC_NODE_PTR_SIZE);
+
+  const ulint data_size= rec_offs_data_size(offsets);
+
+  if (block->page.zip.data == nullptr)
+  {
+    mtr->write<4>(*block, rec + data_size - REC_NODE_PTR_SIZE, page_no);
+    return;
+  }
+
+  page_zip_write_node_ptr(block, rec, data_size, page_no, mtr);
+}
+
+MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result))
+/************************************************************//**
+Fetches and sx-latches the child page that a node pointer refers to.
+Change buffer merge is requested only when the node pointer resides on
+a page at level 1, that is, when the child is a leaf page.
+@return child page, sx-latched
+@retval nullptr if btr_block_get() failed */
+static
+buf_block_t*
+btr_node_ptr_get_child(
+/*===================*/
+  const rec_t*     node_ptr, /*!< in: node pointer */
+  dict_index_t*    index,    /*!< in: index */
+  const rec_offs*  offsets,  /*!< in: array returned by rec_get_offsets() */
+  mtr_t*           mtr,      /*!< in: mtr */
+  dberr_t*         err = nullptr) /*!< out: error code */
+{
+  ut_ad(rec_offs_validate(node_ptr, index, offsets));
+
+  const auto parent_page= page_align(node_ptr);
+  ut_ad(index->table->space_id == page_get_space_id(parent_page));
+
+  const auto child_page_no= btr_node_ptr_get_child_page_no(node_ptr, offsets);
+  const bool child_is_leaf= btr_page_get_level(parent_page) == 1;
+
+  return btr_block_get(*index, child_page_no, RW_SX_LATCH, child_is_leaf,
+                       mtr, err);
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Returns the upper level node pointer to a page. A debug assertion
+requires that mtr holds index->lock in X mode. The parent is located by
+building a node pointer tuple from the user record under the cursor and
+searching for it one level above the page of the cursor.
+NOTE(review): the search below is invoked with RW_S_LATCH, although the
+cursor parameter comment speaks of an x-latched page; confirm which
+one is accurate.
+@return rec_get_offsets() of the node pointer record
+@retval nullptr if the search failed or if the node pointer found does
+not point to the page of the cursor */
+static
+rec_offs*
+btr_page_get_father_node_ptr_for_validate(
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ dict_index_t* index = btr_cur_get_index(cursor);
+ ut_ad(!dict_index_is_spatial(index));
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+ /* The root page has no parent. */
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ const auto level = btr_page_get_level(btr_cur_get_page(cursor));
+
+ const rec_t* user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+
+ /* Position the cursor on level + 1 using a node pointer tuple
+ built from the user record. */
+ if (btr_cur_search_to_nth_level(level + 1,
+ dict_index_build_node_ptr(index,
+ user_rec, 0,
+ heap, level),
+ RW_S_LATCH,
+ cursor, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ const rec_t* node_ptr = btr_cur_get_rec(cursor);
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* The node pointer that was found must refer to the original page. */
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ offsets = nullptr;
+ }
+
+ return(offsets);
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/** Return the node pointer to a page.
+The parent is searched for only among the pages that are registered in
+the mini-transaction, starting from the root page and following node
+pointers that match a tuple built from the record under the cursor.
+@param offsets work area for the return value
+@param heap memory heap
+@param cursor in: child page; out: node pointer to it
+@param mtr mini-transaction
+@return rec_get_offsets() of the node pointer record
+@retval nullptr if the parent page had not been latched in mtr */
+static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap,
+ btr_cur_t *cursor, mtr_t *mtr)
+{
+ const uint32_t page_no= cursor->block()->page.id().page_no();
+ const dict_index_t *index= cursor->index();
+ ut_ad(!index->is_spatial());
+ ut_ad(index->page != page_no);
+
+ /* p is the page number being looked for in the memo: initially the
+ root, then the child that the matching node pointer refers to. */
+ uint32_t p= index->page;
+ auto level= btr_page_get_level(cursor->block()->page.frame);
+ const dtuple_t *tuple=
+ dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level);
+ /* From here on, level is the level of the parent page. */
+ level++;
+
+ ulint i;
+ for (i= 0; i < mtr->get_savepoint(); i++)
+ if (buf_block_t *block= mtr->block_at_savepoint(i))
+ if (block->page.id().page_no() == p)
+ {
+ ut_ad(block->page.lock.have_u_or_x() ||
+ (!block->page.lock.have_s() && index->lock.have_x()));
+ ulint up_match= 0, low_match= 0;
+ cursor->page_cur.block= block;
+ /* A nonzero return value of the search is treated as failure. */
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match,
+ &low_match, &cursor->page_cur,
+ nullptr))
+ return nullptr;
+ offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets);
+ if (p != page_no)
+ {
+ /* This block is at the parent level but does not point to our
+ page: give up. Otherwise descend: look for the child page p. */
+ if (btr_page_get_level(block->page.frame) == level)
+ return nullptr;
+ /* NOTE(review): the loop increment still runs after "continue",
+ so the rescan resumes at slot 1 rather than slot 0. */
+ i= 0; // MDEV-29835 FIXME: require all pages to be latched in order!
+ continue;
+ }
+ ut_ad(block->page.lock.have_u_or_x());
+ if (block->page.lock.have_u_not_x())
+ {
+ /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the
+ root page initially. */
+ ut_ad(block->page.id().page_no() == index->page);
+ block->page.lock.u_x_upgrade();
+ mtr->page_lock_upgrade(*block);
+ }
+ return offsets;
+ }
+
+ return nullptr;
+}
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree. The cursor is first positioned on the record
+that follows the page infimum, and the lookup is then delegated to
+btr_page_get_parent().
+@return rec_get_offsets() of the node pointer record
+@retval nullptr if there is no record after the infimum, or if
+btr_page_get_parent() failed */
+static
+rec_offs*
+btr_page_get_father_block(
+/*======================*/
+  rec_offs*    offsets, /*!< in: work area for the return value */
+  mem_heap_t*  heap,    /*!< in: memory heap to use */
+  mtr_t*       mtr,     /*!< in: mtr */
+  btr_cur_t*   cursor)  /*!< out: cursor on node pointer record,
+                        its page x-latched */
+{
+  rec_t *first=
+    page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+
+  if (UNIV_LIKELY(first != nullptr))
+  {
+    cursor->page_cur.rec= first;
+    return btr_page_get_parent(offsets, heap, cursor, mtr);
+  }
+
+  return nullptr;
+}
+
+/** Seek to the parent page of a B-tree page.
+The cursor is first positioned on the record that follows the page
+infimum; btr_page_get_parent() is then invoked with a temporary heap.
+@param[in,out]  mtr     mini-transaction
+@param[in,out]  cursor  in: cursor on the child page;
+                        out: cursor on the node pointer in the parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+{
+  rec_t *first=
+    page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+  if (UNIV_UNLIKELY(first == nullptr))
+    return false;
+  cursor->page_cur.rec= first;
+
+  /* Only the success of the lookup matters here; the offsets are
+  not used after the heap is freed. */
+  mem_heap_t *scratch= mem_heap_create(100);
+  const rec_offs *parent_offsets=
+    btr_page_get_parent(nullptr, scratch, cursor, mtr);
+  const bool found= parent_offsets != nullptr;
+  mem_heap_free(scratch);
+
+  return found;
+}
+
+#ifdef UNIV_DEBUG
+/** PAGE_INDEX_ID value for freed index B-trees.
+Only defined in UNIV_DEBUG builds; in this part of the file it is
+referenced by a debug assertion in btr_free_root_check(). */
+constexpr index_id_t BTR_FREED_INDEX_ID = 0;
+#endif
+
+/** Free a B-tree root page. btr_free_but_not_root() must already
+have been called.
+The adaptive hash index entries for the page are dropped first. If the
+PAGE_BTR_SEG_TOP segment header of the root page is valid, the segment
+is freed by invoking fseg_free_step() until it returns true.
+@param block   index root page
+@param space   tablespace
+@param mtr     mini-transaction */
+static void btr_free_root(buf_block_t *block, const fil_space_t &space,
+                          mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+                                   MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(mtr->is_named_space(&space));
+
+  btr_search_drop_page_hash_index(block, false);
+
+  if (!btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space))
+    return;
+
+  ut_d(mtr->freeing_tree());
+
+  /* Free the entire segment in small steps. */
+  for (;;)
+    if (fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->page.frame,
+                       mtr))
+      break;
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Prepare to free a B-tree.
+The page is X-latched with BUF_GET_POSSIBLY_FREED and accepted only if
+it is an index page that carries the given index id.
+@param[in]      page_id   page id
+@param[in]      zip_size  ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]      index_id  PAGE_INDEX_ID contents
+@param[in,out]  mtr       mini-transaction
+@return root block, to invoke btr_free_but_not_root() and btr_free_root()
+@retval NULL if the page is no longer a matching B-tree page */
+static
+buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size,
+                                 index_id_t index_id, mtr_t *mtr)
+{
+  ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
+  ut_ad(index_id != BTR_FREED_INDEX_ID);
+
+  buf_block_t *const root= buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+                                            nullptr, BUF_GET_POSSIBLY_FREED,
+                                            mtr);
+  if (root == nullptr)
+    return nullptr;
+
+  const auto frame= root->page.frame;
+  const bool matches= fil_page_index_page_check(frame) &&
+    index_id == btr_page_get_index_id(frame);
+  if (!matches)
+    return nullptr;
+
+  /* This should be a root page. It should not be possible to
+  reassign the same index_id for some other index in the
+  tablespace. */
+  ut_ad(!page_has_siblings(frame));
+  return root;
+}
+
+/** Initialize the root page of the b-tree.
+For a ROW_FORMAT=COMPRESSED page the index id is stored in the
+uncompressed frame and the page is then created by page_create_zip().
+Otherwise the page is created by page_create(), optionally marked as an
+R-tree page, and PAGE_LEVEL (0) and PAGE_INDEX_ID are written via mtr.
+@param[in,out] block     root block
+@param[in]     index_id  index id
+@param[in]     index     index of root page
+@param[in,out] mtr       mini-transaction */
+static void btr_root_page_init(buf_block_t *block, index_id_t index_id,
+                               dict_index_t *index, mtr_t *mtr)
+{
+  constexpr uint16_t id_offset= PAGE_HEADER + PAGE_INDEX_ID;
+  byte *id_ptr= my_assume_aligned<2>(id_offset + block->page.frame);
+
+  /* Create a new index page on the allocated segment page */
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+  {
+    mach_write_to_8(id_ptr, index_id);
+    ut_ad(!page_has_siblings(block->page.zip.data));
+    page_create_zip(block, index, 0, 0, mtr);
+    return;
+  }
+
+  page_create(block, mtr, index && index->table->not_redundant());
+
+  if (index && index->is_spatial())
+  {
+    /* Only the low byte of FIL_PAGE_TYPE differs between
+    FIL_PAGE_INDEX and FIL_PAGE_RTREE. */
+    static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                  FIL_PAGE_RTREE, "compatibility");
+    mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+                  byte(FIL_PAGE_RTREE));
+    if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
+      mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+  }
+
+  /* Set the level of the new index page */
+  mtr->write<2,mtr_t::MAYBE_NOP>(
+    *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U);
+  mtr->write<8,mtr_t::MAYBE_NOP>(*block, id_ptr, index_id);
+}
+
+/** Create the root node for a new index tree.
+The parameters are listed in the order of the function signature.
+@param[in] type type of the index
+@param[in,out] space tablespace where created
+@param[in] index_id index id
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr,
+ dberr_t* err)
+{
+ buf_block_t* block;
+
+ ut_ad(mtr->is_named_space(space));
+ ut_ad(index_id != BTR_FREED_INDEX_ID);
+ ut_ad(index || space == fil_system.sys_space);
+
+ /* Create the two new segments (one, in the case of an ibuf tree) for
+ the index tree; the segment headers are put on the allocated root page
+ (for an ibuf tree, not in the root, but on a separate ibuf header
+ page) */
+
+ if (UNIV_UNLIKELY(type & DICT_IBUF)) {
+ /* Allocate first the ibuf header page */
+ buf_block_t* ibuf_hdr_block = fseg_create(
+ space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err);
+
+ if (ibuf_hdr_block == NULL) {
+ return(FIL_NULL);
+ }
+
+ ut_ad(ibuf_hdr_block->page.id().page_no()
+ == IBUF_HEADER_PAGE_NO);
+ /* Allocate then the next page to the segment: it will be the
+ tree root page */
+
+ block = fseg_alloc_free_page_general(
+ buf_block_get_frame(ibuf_hdr_block)
+ + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ IBUF_TREE_ROOT_PAGE_NO,
+ FSP_UP, false, mtr, mtr, err);
+
+ if (block == NULL) {
+ return(FIL_NULL);
+ }
+
+ ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO));
+
+ flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ } else {
+ /* The first segment is created together with the root page;
+ the second segment header is placed on that same page. */
+ block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
+ mtr, err);
+
+ if (block == NULL) {
+ return(FIL_NULL);
+ }
+
+ if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
+ err, false, block)) {
+ /* Not enough space for new segment, free root
+ segment before return. */
+ btr_free_root(block, *space, mtr);
+ return(FIL_NULL);
+ }
+ }
+
+ ut_ad(!page_has_siblings(block->page.frame));
+
+ btr_root_page_init(block, index_id, index, mtr);
+
+ /* We reset the free bits for the page in a separate
+ mini-transaction to allow creation of several trees in the
+ same mtr, otherwise the latch on a bitmap page would prevent
+ it because of the latching order.
+
+ Note: Insert Buffering is disabled for temporary tables given that
+ most temporary tables are smaller in size and short-lived. */
+ if (!(type & DICT_CLUSTERED)
+ && (!index || !index->table->is_temporary())) {
+ ibuf_reset_free_bits(block);
+ }
+
+ /* In the following assertion we test that two records of maximum
+ allowed size fit on the root page: this fact is needed to ensure
+ correctness of split algorithms */
+
+ ut_ad(page_get_max_insert_size(block->page.frame, 2)
+ > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+ return(block->page.id().page_no());
+}
+
+/** Free a B-tree except the root page. The root page MUST be freed after
+this by calling btr_free_root.
+Each freeing step runs in its own local mini-transaction: first the
+PAGE_BTR_SEG_LEAF segment is freed completely, then the PAGE_BTR_SEG_TOP
+segment is freed except for its header page.
+@param[in,out] block root page
+@param[in] log_mode mtr logging mode
+@param[in] ahi (only if BTR_CUR_HASH_ADAPT is defined) flag that is
+ forwarded to fseg_free_step() and fseg_free_step_not_header();
+ presumably whether adaptive hash index entries may exist
+ for the pages being freed -- TODO confirm in fsp0fsp */
+static
+void
+btr_free_but_not_root(
+ buf_block_t* block,
+ mtr_log_t log_mode
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif
+ )
+{
+ mtr_t mtr;
+
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(!page_has_siblings(block->page.frame));
+leaf_loop:
+ /* One step of freeing the leaf segment per mini-transaction. */
+ mtr_start(&mtr);
+ ut_d(mtr.freeing_tree());
+ mtr_set_log_mode(&mtr, log_mode);
+ fil_space_t *space = mtr.set_named_space_id(block->page.id().space());
+
+ /* Give up without freeing anything further if either segment
+ header on the root page fails validation. */
+ if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *block, *space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *space)) {
+ mtr_commit(&mtr);
+ return;
+ }
+
+ /* NOTE: page hash indexes are dropped when a page is freed inside
+ fsp0fsp. */
+
+ bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + block->page.frame, &mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ );
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto leaf_loop;
+ }
+top_loop:
+ /* One step of freeing the non-leaf segment per mini-transaction.
+ A failed validation of the segment header also ends the loop. */
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, log_mode);
+ space = mtr.set_named_space_id(block->page.id().space());
+
+ finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *space)
+ || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + block->page.frame, &mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ );
+ mtr_commit(&mtr);
+
+ if (!finished) {
+ goto top_loop;
+ }
+}
+
+/** Clear the index tree and reinitialize the root page, in the
+rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+@param thr query thread (not referenced in the function body)
+@return error code, as reported by buf_page_get_gen() for the root page
+or by fseg_create() for the new leaf segment */
+TRANSACTIONAL_TARGET
+dberr_t dict_index_t::clear(que_thr_t *thr)
+{
+ mtr_t mtr;
+ mtr.start();
+ /* Temporary tables are not redo-logged. */
+ if (table->is_temporary())
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ else
+ set_modified(mtr);
+ mtr_sx_lock_index(this, &mtr);
+
+ dberr_t err;
+ if (buf_block_t *root_block=
+ buf_page_get_gen(page_id_t(table->space->id, page),
+ table->space->zip_size(),
+ RW_X_LATCH, nullptr, BUF_GET, &mtr, &err))
+ {
+ /* Free every page of the tree except the root. */
+ btr_free_but_not_root(root_block, mtr.get_log_mode()
+#ifdef BTR_CUR_HASH_ADAPT
+ ,n_ahi_pages() != 0
+#endif
+ );
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (root_block->index)
+ btr_search_drop_page_hash_index(root_block, false);
+ ut_ad(n_ahi_pages() == 0);
+#endif
+ /* Wipe the old leaf segment header before creating a new segment
+ in its place; the root page is reinitialized only if that succeeds. */
+ mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+ FSEG_HEADER_SIZE, 0);
+ if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr,
+ &err, false, root_block))
+ btr_root_page_init(root_block, id, this, &mtr);
+ }
+
+ mtr.commit();
+ return err;
+}
+
+/** Free a persistent index tree if it exists.
+Nothing is done unless btr_free_root_check() accepts the page as the
+root of the index identified by index_id.
+@param[in,out] space     tablespace
+@param[in]     page      root page number
+@param[in]     index_id  PAGE_INDEX_ID contents
+@param[in,out] mtr       mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+                        index_id_t index_id, mtr_t *mtr)
+{
+  buf_block_t *root_block=
+    btr_free_root_check(page_id_t(space->id, page), space->zip_size(),
+                        index_id, mtr);
+  if (!root_block)
+    return;
+
+  /* First everything but the root, then the root itself. */
+  btr_free_but_not_root(root_block, mtr->get_log_mode());
+  mtr->set_named_space(space);
+  btr_free_root(root_block, *space, mtr);
+}
+
+/** Drop a temporary table.
+Every index tree whose root page can be fetched from the temporary
+tablespace is freed, without redo logging. The mini-transaction is
+committed and restarted after each freed tree.
+@param table   temporary table */
+void btr_drop_temporary_table(const dict_table_t &table)
+{
+  ut_ad(table.is_temporary());
+  ut_ad(table.space == fil_system.temp_space);
+
+  mtr_t mtr;
+  mtr.start();
+
+  for (const dict_index_t *index= table.indexes.start; index;
+       index= dict_table_get_next_index(index))
+  {
+    buf_block_t *root_block=
+      buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0,
+                       RW_X_LATCH, nullptr, BUF_GET, &mtr,
+                       nullptr, false);
+    if (!root_block)
+      continue;
+
+    btr_free_but_not_root(root_block, MTR_LOG_NO_REDO);
+    mtr.set_log_mode(MTR_LOG_NO_REDO);
+    btr_free_root(root_block, *fil_system.temp_space, &mtr);
+    mtr.commit();
+    mtr.start();
+  }
+
+  mtr.commit();
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+The root page of the clustered index is fetched with a shared latch
+inside a local mini-transaction.
+@param[in,out] index   clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+{
+  ut_ad(index->is_primary());
+  ut_ad(index->table->persistent_autoinc);
+  ut_ad(!index->table->is_temporary());
+
+  mtr_t mtr;
+  mtr.start();
+
+  buf_block_t* root_block = buf_page_get(
+    page_id_t(index->table->space_id, index->page),
+    index->table->space->zip_size(),
+    RW_S_LATCH, &mtr);
+
+  /* A root page that could not be fetched is reported as 0. */
+  const ib_uint64_t last_used = root_block
+    ? page_get_autoinc(root_block->page.frame) : 0;
+
+  mtr.commit();
+  return last_used;
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+The fallback is taken only when the root page was read, the stored
+value is 0 and the root page is not empty.
+@param[in] table   table containing an AUTO_INCREMENT column
+@param[in] col_no  index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+{
+  ut_ad(table->persistent_autoinc);
+  ut_ad(!table->is_temporary());
+
+  dict_index_t* index = dict_table_get_first_index(table);
+
+  if (!index) {
+    return 0;
+  }
+
+  ib_uint64_t autoinc = 0;
+  bool use_fallback = false;
+
+  mtr_t mtr;
+  mtr.start();
+  buf_block_t* root_block = buf_page_get(
+    page_id_t(index->table->space_id, index->page),
+    index->table->space->zip_size(),
+    RW_S_LATCH, &mtr);
+
+  if (root_block) {
+    autoinc = page_get_autoinc(root_block->page.frame);
+    use_fallback = autoinc == 0
+      && !page_is_empty(root_block->page.frame);
+  }
+  mtr.commit();
+
+  if (!use_fallback) {
+    return autoinc;
+  }
+
+  /* This should be an old data file where
+  PAGE_ROOT_AUTO_INC was initialized to 0.
+  Fall back to reading MAX(autoinc_col).
+  There should be an index on it. */
+  const dict_col_t* autoinc_col
+    = dict_table_get_nth_col(table, col_no);
+
+  for (; index; index = dict_table_get_next_index(index)) {
+    if (index->fields[0].col == autoinc_col) {
+      return row_search_max_autoinc(index);
+    }
+  }
+
+  /* No index starts with the AUTO_INCREMENT column. */
+  return autoinc;
+}
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+Nothing is written if the root page cannot be fetched.
+@param[in,out] index    clustered index
+@param[in]     autoinc  the AUTO_INCREMENT value
+@param[in]     reset    whether to reset the AUTO_INCREMENT
+                        to a possibly smaller value than currently
+                        exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
+{
+  ut_ad(index->is_primary());
+  ut_ad(index->table->persistent_autoinc);
+  ut_ad(!index->table->is_temporary());
+
+  mtr_t mtr;
+  mtr.start();
+
+  fil_space_t *tablespace= index->table->space;
+  buf_block_t *root_block=
+    buf_page_get(page_id_t(tablespace->id, index->page),
+                 tablespace->zip_size(), RW_SX_LATCH, &mtr);
+
+  if (root_block)
+  {
+    mtr.set_named_space(tablespace);
+    page_set_autoinc(root_block, autoinc, &mtr, reset);
+  }
+
+  mtr.commit();
+}
+
+/** Reorganize an uncompressed index page.
+The page is copied into a temporary buffer block, recreated with
+page_create(), and the records are copied back from the copy. Redo
+logging is suspended while the page is rebuilt; if the original log mode
+was MTR_LOG_ALL, the byte ranges that differ from the old page image are
+logged afterwards.
+The temporary buffer block is released on every exit path.
+@param cursor  index page cursor; its position is restored on success
+@param mtr     mini-transaction
+@return error code
+@retval DB_CORRUPTION if the cursor position cannot be determined or
+restored, or if the rebuilt page differs in data size or in maximum
+insert size from the original page */
+static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
+{
+  buf_block_t *const block= cursor->block;
+
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!is_buf_block_get_page_zip(block));
+  ut_ad(fil_page_index_page_check(block->page.frame));
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().space() == cursor->index->table->space->id);
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().page_no() != cursor->index->page ||
+        !page_has_siblings(block->page.frame));
+
+  /* Save the cursor position. */
+  const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+  if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+    return DB_CORRUPTION;
+
+  btr_search_drop_page_hash_index(block, false);
+
+  buf_block_t *old= buf_block_alloc();
+  /* Copy the old page to temporary space */
+  memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->page.frame, block->page.frame,
+                                     srv_page_size);
+
+  const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
+
+  page_create(block, mtr, cursor->index->table->not_redundant());
+  if (cursor->index->is_spatial())
+    block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
+
+  static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                FIL_PAGE_RTREE, "compatibility");
+
+  /* Copy the records from the temporary space to the recreated page;
+  do not copy the lock bits yet */
+
+  dberr_t err=
+    page_copy_rec_list_end_no_locks(block, old,
+                                    page_get_infimum_rec(old->page.frame),
+                                    cursor->index, mtr);
+  mtr->set_log_mode(log_mode);
+
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+  {
+    /* Do not leak the temporary copy of the page. */
+    buf_block_free(old);
+    return err;
+  }
+
+  /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+  ut_ad(!page_get_max_trx_id(block->page.frame));
+  memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame,
+                    PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8);
+#ifdef UNIV_DEBUG
+  if (page_get_max_trx_id(block->page.frame))
+    /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+    clustered index root pages. */
+    ut_ad(dict_index_is_sec_or_ibuf(cursor->index)
+          ? page_is_leaf(block->page.frame)
+          : block->page.id().page_no() == cursor->index->page);
+  else
+    /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
+    the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
+    pages, and in temporary tables.  It was always zero-initialized in
+    page_create().  PAGE_MAX_TRX_ID must be nonzero on
+    dict_index_is_sec_or_ibuf() leaf pages. */
+    ut_ad(cursor->index->table->is_temporary() ||
+          !page_is_leaf(block->page.frame) ||
+          !dict_index_is_sec_or_ibuf(cursor->index));
+#endif
+
+  const uint16_t data_size1= page_get_data_size(old->page.frame);
+  const uint16_t data_size2= page_get_data_size(block->page.frame);
+  const ulint max1=
+    page_get_max_insert_size_after_reorganize(old->page.frame, 1);
+  const ulint max2=
+    page_get_max_insert_size_after_reorganize(block->page.frame, 1);
+
+  if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2))
+  {
+    sql_print_error("InnoDB: Page old data size %u new data size %u"
+                    ", page old max ins size %zu new max ins size %zu",
+                    data_size1, data_size2, max1, max2);
+    /* Do not leak the temporary copy of the page. */
+    buf_block_free(old);
+    return DB_CORRUPTION;
+  }
+
+  /* Restore the cursor position. */
+  if (!pos)
+    ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame));
+  else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos)))
+  {
+    /* Do not leak the temporary copy of the page. */
+    buf_block_free(old);
+    return DB_CORRUPTION;
+  }
+
+  if (block->page.id().page_no() != cursor->index->page ||
+      fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT)
+    ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER));
+  else if (!cursor->index->is_instant())
+  {
+    ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE));
+    ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2,
+                  block->page.frame + FIL_PAGE_TYPE + 2,
+                  PAGE_HEADER - FIL_PAGE_TYPE - 2));
+    mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame,
+                                FIL_PAGE_INDEX);
+  }
+  else
+  {
+    /* Preserve the PAGE_INSTANT information. */
+    memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame,
+                      FIL_PAGE_TYPE + old->page.frame, 2);
+    memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame,
+                      PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2);
+    if (!cursor->index->table->instant);
+    else if (page_is_comp(block->page.frame))
+    {
+      memcpy(PAGE_NEW_INFIMUM + block->page.frame,
+             PAGE_NEW_INFIMUM + old->page.frame, 8);
+      memcpy(PAGE_NEW_SUPREMUM + block->page.frame,
+             PAGE_NEW_SUPREMUM + old->page.frame, 8);
+    }
+    else
+    {
+      memcpy(PAGE_OLD_INFIMUM + block->page.frame,
+             PAGE_OLD_INFIMUM + old->page.frame, 8);
+      memcpy(PAGE_OLD_SUPREMUM + block->page.frame,
+             PAGE_OLD_SUPREMUM + old->page.frame, 8);
+    }
+
+    ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER));
+  }
+
+  ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+                block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+                PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER)));
+
+  if (!cursor->index->has_locking());
+  else if (cursor->index->page == FIL_NULL)
+    ut_ad(cursor->index->is_dummy);
+  else
+    lock_move_reorganize_page(block, old);
+
+  /* Write log for the changes, if needed. */
+  if (log_mode == MTR_LOG_ALL)
+  {
+    /* Check and log the changes in the page header. */
+    ulint a, e;
+    for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++)
+    {
+      if (old->page.frame[a] == block->page.frame[a])
+        continue;
+      while (--e, old->page.frame[e] == block->page.frame[e]);
+      e++;
+      ut_ad(a < e);
+      /* Write log for the changed page header fields. */
+      mtr->memcpy(*block, a, e - a);
+      break;
+    }
+
+    const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP);
+
+    if (page_is_comp(block->page.frame))
+    {
+      /* info_bits=0, n_owned=1, heap_no=0, status */
+      ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES +
+                    block->page.frame,
+                    PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES +
+                    old->page.frame, 3));
+      /* If the 'next' pointer of the infimum record has changed, log it. */
+      a= PAGE_NEW_INFIMUM - 2;
+      e= a + 2;
+      if (block->page.frame[a] == old->page.frame[a])
+        a++;
+      if (--e, block->page.frame[e] != old->page.frame[e])
+        e++;
+      if (ulint len= e - a)
+        mtr->memcpy(*block, a, len);
+      /* The infimum record itself must not change. */
+      ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame,
+                    PAGE_NEW_INFIMUM + old->page.frame, 8));
+      /* Log any change of the n_owned of the supremum record. */
+      a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES;
+      if (block->page.frame[a] != old->page.frame[a])
+        mtr->memcpy(*block, a, 1);
+      /* The rest of the supremum record must not change. */
+      ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1],
+                    PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM +
+                    REC_N_NEW_EXTRA_BYTES - 1));
+
+      /* Log the differences in the payload. */
+      for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++)
+      {
+        if (old->page.frame[a] == block->page.frame[a])
+          continue;
+        while (--e, old->page.frame[e] == block->page.frame[e]);
+        e++;
+        ut_ad(a < e);
+        /* TODO: write MEMMOVE records to minimize this further! */
+        mtr->memcpy(*block, a, e - a);
+        break;
+      }
+    }
+    else
+    {
+      /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */
+      ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES +
+                    block->page.frame,
+                    PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES +
+                    old->page.frame, 4));
+      /* If the 'next' pointer of the infimum record has changed, log it. */
+      a= PAGE_OLD_INFIMUM - 2;
+      e= a + 2;
+      if (block->page.frame[a] == old->page.frame[a])
+        a++;
+      if (--e, block->page.frame[e] != old->page.frame[e])
+        e++;
+      if (ulint len= e - a)
+        mtr->memcpy(*block, a, len);
+      /* The infimum record itself must not change. */
+      ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame,
+                    PAGE_OLD_INFIMUM + old->page.frame, 8));
+      /* Log any change of the n_owned of the supremum record. */
+      a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES;
+      if (block->page.frame[a] != old->page.frame[a])
+        mtr->memcpy(*block, a, 1);
+      ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1],
+                    PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM +
+                    REC_N_OLD_EXTRA_BYTES - 1));
+
+      /* Log the differences in the payload. */
+      for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++)
+      {
+        if (old->page.frame[a] == block->page.frame[a])
+          continue;
+        while (--e, old->page.frame[e] == block->page.frame[e]);
+        e++;
+        ut_ad(a < e);
+        /* TODO: write MEMMOVE records to minimize this further! */
+        mtr->memcpy(*block, a, e - a);
+        break;
+      }
+    }
+
+    e= srv_page_size - PAGE_DIR;
+    a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame);
+
+    /* Zero out the payload area. */
+    mtr->memset(*block, top, a - top, 0);
+
+    /* Log changes to the page directory. */
+    for (; a < e; a++)
+    {
+      if (old->page.frame[a] == block->page.frame[a])
+        continue;
+      while (--e, old->page.frame[e] == block->page.frame[e]);
+      e++;
+      ut_ad(a < e);
+      /* Write log for the changed page directory slots. */
+      mtr->memcpy(*block, a, e - a);
+      break;
+    }
+  }
+
+  buf_block_free(old);
+
+  MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
+  MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
+  return DB_SUCCESS;
+}
+
+/*************************************************************//**
+Reorganizes an index page.
+
+A ROW_FORMAT=COMPRESSED page is handed to page_zip_reorganize();
+any other page is reorganized via btr_page_reorganize_low() with a
+page cursor positioned before the first record.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t
+btr_page_reorganize_block(
+  ulint         z_level,/*!< in: compression level to be used
+                        if dealing with compressed page */
+  buf_block_t*  block,  /*!< in/out: B-tree page */
+  dict_index_t* index,  /*!< in: the index tree of the page */
+  mtr_t*        mtr)    /*!< in/out: mini-transaction */
+{
+  if (!buf_block_get_page_zip(block))
+  {
+    page_cur_t page_cursor;
+    page_cur_set_before_first(block, &page_cursor);
+    page_cursor.index= index;
+    return btr_page_reorganize_low(&page_cursor, mtr);
+  }
+
+  return page_zip_reorganize(block, index, z_level, mtr, true);
+}
+
+/*************************************************************//**
+Reorganizes an index page.
+
+For a ROW_FORMAT=COMPRESSED page, the ordinal position of the cursor
+record is saved before page_zip_reorganize() and is used for restoring
+the cursor afterwards, unless the reorganization returned DB_FAIL.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@param cursor  page cursor
+@param mtr     mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+{
+  if (!buf_block_get_page_zip(cursor->block))
+    return btr_page_reorganize_low(cursor, mtr);
+
+  const ulint saved_pos= page_rec_get_n_recs_before(cursor->rec);
+  if (UNIV_UNLIKELY(saved_pos == ULINT_UNDEFINED))
+    return DB_CORRUPTION;
+
+  const dberr_t err= page_zip_reorganize(cursor->block, cursor->index,
+                                         page_zip_level, mtr, true);
+  if (err == DB_FAIL)
+    return err;
+
+  if (saved_pos == 0)
+  {
+    /* The cursor was, and must remain, on the page infimum. */
+    ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame));
+    return err;
+  }
+
+  cursor->rec= page_rec_get_nth(cursor->block->page.frame, saved_pos);
+  return cursor->rec ? err : DB_CORRUPTION;
+}
+
+/** Empty an index page (possibly the root page). @see btr_page_create().
+The page is recreated in place. On the root page of a clustered index,
+the PAGE_ROOT_AUTO_INC value is read before the page is recreated and
+is written back afterwards.
+@param[in,out] block     page to be emptied
+@param[in,out] page_zip  compressed page frame, or NULL
+@param[in]     index     index of the page
+@param[in]     level     B-tree level of the page (0=leaf)
+@param[in,out] mtr       mini-transaction */
+void
+btr_page_empty(
+  buf_block_t*    block,
+  page_zip_des_t* page_zip,
+  dict_index_t*   index,
+  ulint           level,
+  mtr_t*          mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(page_zip == buf_block_get_page_zip(block));
+  ut_ad(!index->is_dummy);
+  ut_ad(index->table->space->id == block->page.id().space());
+#ifdef UNIV_ZIP_DEBUG
+  ut_a(!page_zip
+       || page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+  btr_search_drop_page_hash_index(block, false);
+
+  /* Recreate the page: note that global data on page (possible
+  segment headers, next page-field, etc.) is preserved intact */
+
+  /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index
+  root page. */
+  ib_uint64_t saved_autoinc = 0;
+  if (dict_index_is_clust(index)
+      && index->page == block->page.id().page_no()) {
+    saved_autoinc = page_get_autoinc(block->page.frame);
+  }
+
+  if (page_zip) {
+    page_create_zip(block, index, level, saved_autoinc, mtr);
+    return;
+  }
+
+  page_create(block, mtr, index->table->not_redundant());
+
+  if (index->is_spatial()) {
+    static_assert(((FIL_PAGE_INDEX & 0xff00)
+                   | byte(FIL_PAGE_RTREE))
+                  == FIL_PAGE_RTREE, "compatibility");
+    mtr->write<1>(*block, FIL_PAGE_TYPE + 1
+                  + block->page.frame,
+                  byte(FIL_PAGE_RTREE));
+    if (mach_read_from_8(block->page.frame
+                         + FIL_RTREE_SPLIT_SEQ_NUM)) {
+      mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+                  8, 0);
+    }
+  }
+
+  mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+                                 + block->page.frame, level);
+
+  if (saved_autoinc) {
+    mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+                  + block->page.frame, saved_autoinc);
+  }
+}
+
+/** Write instant ALTER TABLE metadata to a root page.
+If the page type is still FIL_PAGE_INDEX, it is changed to
+FIL_PAGE_TYPE_INSTANT and index.n_core_fields is stored in the
+PAGE_INSTANT header field. If index.table->instant is set, the
+infimum and supremum pseudo-records are additionally overwritten
+with zero bytes and index.n_core_null_bytes.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
+{
+ ut_ad(index.n_core_fields > 0);
+ ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
+ ut_ad(index.is_instant());
+ ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT
+ || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX);
+ ut_ad(!page_has_siblings(root->page.frame));
+ ut_ad(root->page.id().page_no() == index.page);
+
+ rec_t* infimum = page_get_infimum_rec(root->page.frame);
+ rec_t* supremum = page_get_supremum_rec(root->page.frame);
+ byte* page_type = root->page.frame + FIL_PAGE_TYPE;
+ /* Current contents of the PAGE_INSTANT header field */
+ uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT);
+
+ switch (mach_read_from_2(page_type)) {
+ case FIL_PAGE_TYPE_INSTANT:
+ ut_ad(page_get_instant(root->page.frame)
+ == index.n_core_fields);
+ /* If the pseudo-records no longer hold their literal names,
+ they are expected to have been overwritten already (see the
+ end of this function); there is nothing more to write. */
+ if (memcmp(infimum, "infimum", 8)
+ || memcmp(supremum, "supremum", 8)) {
+ ut_ad(index.table->instant);
+ ut_ad(!memcmp(infimum, field_ref_zero, 8));
+ ut_ad(!memcmp(supremum, field_ref_zero, 7));
+ /* The n_core_null_bytes only matters for
+ ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+ ut_ad(supremum[7] == index.n_core_null_bytes
+ || !index.table->not_redundant());
+ return;
+ }
+ break;
+ default:
+ /* Unexpected page type: assert in debug builds, and otherwise
+ treat the page like FIL_PAGE_INDEX. */
+ ut_ad("wrong page type" == 0);
+ /* fall through */
+ case FIL_PAGE_INDEX:
+ ut_ad(!page_is_comp(root->page.frame)
+ || !page_get_instant(root->page.frame));
+ ut_ad(!memcmp(infimum, "infimum", 8));
+ ut_ad(!memcmp(supremum, "supremum", 8));
+ mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
+ /* Keep the existing low bits of the field and store
+ n_core_fields above them, shifted left by 3 bits. */
+ ut_ad(i <= PAGE_NO_DIRECTION);
+ i |= static_cast<uint16_t>(index.n_core_fields << 3);
+ mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT
+ + root->page.frame, i);
+ break;
+ }
+
+ if (index.table->instant) {
+ /* Zero-fill 8 bytes of the infimum and 7 bytes of the supremum;
+ the 8th byte of the supremum stores n_core_null_bytes. */
+ mtr->memset(root, infimum - root->page.frame, 8, 0);
+ mtr->memset(root, supremum - root->page.frame, 7, 0);
+ mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
+ index.n_core_null_bytes);
+ }
+}
+
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+The infimum and supremum pseudo-records of the root page are rewritten
+with their literal names. If all is set, FIL_PAGE_TYPE is also reset to
+FIL_PAGE_INDEX and the PAGE_INSTANT header field is rewritten with the
+value returned by page_ptr_get_direction().
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr)
+{
+ ut_ad(!index.table->is_temporary());
+ ut_ad(index.is_primary());
+ buf_block_t *root= btr_get_latched_root(index, mtr);
+ byte *page_type= root->page.frame + FIL_PAGE_TYPE;
+ if (all)
+ {
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
+ mach_read_from_2(page_type) == FIL_PAGE_INDEX);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX);
+ byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame;
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant,
+ page_ptr_get_direction(instant + 1));
+ }
+ else
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
+ /* Bytes 0..7 hold "supremum"; bytes 8..15 hold "infimum" followed
+ by the terminating NUL of the string literal. */
+ static const byte supremuminfimum[8 + 8] = "supremuminfimum";
+ uint16_t infimum, supremum;
+ if (page_is_comp(root->page.frame))
+ {
+ infimum= PAGE_NEW_INFIMUM;
+ supremum= PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ infimum= PAGE_OLD_INFIMUM;
+ supremum= PAGE_OLD_SUPREMUM;
+ }
+ /* Either both pseudo-records hold their literal names, or neither does. */
+ ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) ==
+ !memcmp(&root->page.frame[supremum], supremuminfimum, 8));
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[infimum],
+ supremuminfimum + 8, 8);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[supremum],
+ supremuminfimum, 8);
+}
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t* page_cursor;
+ page_zip_des_t* root_page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* root;
+ buf_block_t* new_block;
+
+ root = btr_cur_get_block(cursor);
+ root_page_zip = buf_block_get_page_zip(root);
+ ut_ad(!page_is_empty(root->page.frame));
+ index = btr_cur_get_index(cursor);
+ ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!root_page_zip
+ || page_zip_validate(root_page_zip, root->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ const page_id_t root_id{root->page.id()};
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX));
+
+ if (index->page != root_id.page_no()) {
+ ut_ad("corrupted root page number" == 0);
+ return nullptr;
+ }
+
+ if (index->is_ibuf()) {
+ } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *root, *index->table->space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *root, *index->table->space)) {
+ return nullptr;
+ }
+
+ /* Allocate a new page to the tree. Root splitting is done by first
+ moving the root records to the new page, emptying the root, putting
+ a node pointer to the new page, and then splitting the new page. */
+
+ level = btr_page_get_level(root->page.frame);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err);
+
+ if (!new_block) {
+ return nullptr;
+ }
+
+ new_page_zip = buf_block_get_page_zip(new_block);
+ ut_a(!new_page_zip == !root_page_zip);
+ ut_a(!new_page_zip
+ || page_zip_get_size(new_page_zip)
+ == page_zip_get_size(root_page_zip));
+
+ btr_page_create(new_block, new_page_zip, index, level, mtr);
+ if (page_has_siblings(new_block->page.frame)) {
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+ memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV,
+ 0xff, 8);
+ mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
+ 0xff, 8);
+ }
+ }
+
+ /* Copy the records from root to the new page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(new_block, root,
+ page_get_infimum_rec(root->page.frame),
+ index, mtr, err)) {
+ switch (*err) {
+ case DB_SUCCESS:
+ break;
+ case DB_FAIL:
+ *err = DB_SUCCESS;
+ break;
+ default:
+ return nullptr;
+ }
+
+ ut_a(new_page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(new_block, root_page_zip,
+ root->page.frame, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+ if (index->has_locking()) {
+ lock_move_rec_list_end(
+ new_block, root,
+ page_get_infimum_rec(root->page.frame));
+ }
+
+ /* Move any existing predicate locks */
+ if (dict_index_is_spatial(index)) {
+ lock_prdt_rec_move(new_block, root_id);
+ } else {
+ btr_search_move_or_delete_hash_entries(
+ new_block, root);
+ }
+ }
+
+ constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
+ if (dict_index_is_sec_or_ibuf(index)) {
+ /* In secondary indexes and the change buffer,
+ PAGE_MAX_TRX_ID can be reset on the root page, because
+ the field only matters on leaf pages, and the root no
+ longer is a leaf page. (Older versions of InnoDB did
+ set PAGE_MAX_TRX_ID on all secondary index pages.) */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(root, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(root->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + root->page.zip.data, 0, 8);
+ }
+ }
+ } else {
+ /* PAGE_ROOT_AUTO_INC is only present in the clustered index
+ root page; on other clustered index pages, we want to reserve
+ the field PAGE_MAX_TRX_ID for future use. */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(new_block, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + new_block->page.zip.data,
+ 0, 8);
+ }
+ }
+ }
+
+ /* If this is a pessimistic insert which is actually done to
+ perform a pessimistic update then we have stored the lock
+ information of the record to be inserted on the infimum of the
+ root page: we cannot discard the lock structs on the root page */
+
+ if (index->has_locking()) {
+ lock_update_root_raise(*new_block, root_id);
+ }
+
+ /* Create a memory heap where the node pointer is stored */
+ if (!*heap) {
+ *heap = mem_heap_create(1000);
+ }
+
+ const uint32_t new_page_no = new_block->page.id().page_no();
+ rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame));
+ ut_ad(rec); /* We just created the page. */
+
+ /* Build the node pointer (= node key and page address) for the
+ child */
+ if (dict_index_is_spatial(index)) {
+ rtr_mbr_t new_mbr;
+
+ rtr_page_cal_mbr(index, new_block, &new_mbr, *heap);
+ node_ptr = rtr_index_build_node_ptr(
+ index, &new_mbr, rec, new_page_no, *heap);
+ } else {
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, new_page_no, *heap, level);
+ }
+ /* The node pointer must be marked as the predefined minimum record,
+ as there is no lower alphabetical limit to records in the leftmost
+ node of a level: */
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+
+ /* Rebuild the root page to get free space */
+ btr_page_empty(root, root_page_zip, index, level + 1, mtr);
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(root->page.frame));
+
+ if (index->is_instant()) {
+ ut_ad(!root_page_zip);
+ btr_set_instant(root, *index, mtr);
+ }
+
+ ut_ad(!page_has_siblings(root->page.frame));
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Insert node pointer to the root */
+
+ page_cur_set_before_first(root, page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+ offsets, heap, 0, mtr);
+
+ /* The root page should only contain the node pointer
+ to new_block at this point. Thus, the data should fit. */
+ ut_a(node_ptr_rec);
+
+ /* We play safe and reset the free bits for the new page */
+
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ ibuf_reset_free_bits(new_block);
+ }
+
+ page_cursor->block = new_block;
+ page_cursor->index = index;
+
+ ut_ad(dtuple_check_typed(tuple));
+ /* Reposition the cursor to the child node */
+ ulint low_match = 0, up_match = 0;
+
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match, &low_match,
+ page_cursor, nullptr)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
+ /* Split the child and insert tuple */
+ return btr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr, err);
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left, that is, when the record following the cursor
+position is the one that PAGE_LAST_INSERT points to.
+@param[in]	cursor	insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor)
+{
+	rec_t* const	cur_rec = btr_cur_get_rec(cursor);
+	const page_t*	page = page_align(cur_rec);
+	rec_t* const	succ = page_rec_get_next(cur_rec);
+
+	if (succ != page_header_get_ptr(page, PAGE_LAST_INSERT)) {
+		/* The successor of the cursor record is not the
+		record referenced by PAGE_LAST_INSERT: no pattern. */
+		return NULL;
+	}
+
+	/* The metadata record must be present in the leftmost leaf page
+	of the clustered index, if and only if index->is_instant().
+	However, during innobase_instant_try(), index->is_instant()
+	would already hold when row_ins_clust_index_entry_low()
+	is being invoked to insert the metadata record.
+	So, we can only assert that when the metadata record exists,
+	index->is_instant() must hold. */
+	ut_ad(!page_is_leaf(page) || page_has_prev(page)
+	      || cursor->index()->is_instant()
+	      || !(rec_get_info_bits(page_rec_get_next_const(
+					     page_get_infimum_rec(page)),
+				     cursor->index()->table->not_redundant())
+		   & REC_INFO_MIN_REC_FLAG));
+
+	const rec_t*	infimum = page_get_infimum_rec(page);
+
+	/* When the convergence point is in the middle of the page, the
+	cursor record itself (the one immediately before the new insert)
+	also goes to the upper page. Otherwise, we could repeatedly move
+	from page to page lots of records smaller than the convergence
+	point. */
+	if (cur_rec != infimum
+	    && cur_rec != page_rec_get_next_const(infimum)) {
+		return cur_rec;
+	}
+
+	/* The cursor is on the infimum or on the first record after it:
+	start the upper half at the successor instead. */
+	return succ;
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in]	cursor		insert position
+@param[out]	split_rec	if split recommended, the first record
+				on the right half page, or
+				NULL if the to-be-inserted record
+				should be first; not written when the
+				function returns false
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec)
+{
+	rec_t*		insert_point = btr_cur_get_rec(cursor);
+	const page_t*	page = page_align(insert_point);
+
+	/* We use eager heuristics: if the new insert would be right after
+	the previous insert on the same page, we assume that there is a
+	pattern of sequential inserts here. */
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) {
+		return false;
+	}
+
+	insert_point = page_rec_get_next(insert_point);
+
+	if (!insert_point || page_rec_is_supremum(insert_point)) {
+		insert_point = NULL;
+	} else {
+		/* If there are >= 2 user records up from the insert
+		point, split all but 1 off. We want to keep one because
+		then sequential inserts can use the adaptive hash
+		index, as they can do the necessary checks of the right
+		search position just by looking at the records on this
+		page. */
+		insert_point = page_rec_get_next(insert_point);
+
+		/* page_rec_get_next() is treated above as possibly
+		returning NULL; apply the same check to the second
+		lookup before handing the pointer to
+		page_rec_is_supremum(). */
+		if (!insert_point || page_rec_is_supremum(insert_point)) {
+			insert_point = NULL;
+		}
+	}
+
+	*split_rec = insert_point;
+	return true;
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+
+The records of the page are accumulated from the start of the page, with
+the tuple counted right after the cursor record, until their size plus
+the page directory space reaches half of the total.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ ulint total_space;
+ ulint incl_data;
+ rec_t* ins_rec;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint n;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+ page = btr_cur_get_page(cursor);
+
+ /* Size of the tuple as computed by rec_get_converted_size(), and
+ the free space of an empty page in the format of this page */
+ insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ page_zip = btr_cur_get_page_zip(cursor);
+ if (page_zip) {
+ /* Estimate the free space of an empty compressed page. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index()->n_fields,
+ page_zip_get_size(page_zip));
+
+ /* Use the smaller of the two limits. */
+ if (free_space > (ulint) free_space_zip) {
+ free_space = (ulint) free_space_zip;
+ }
+ }
+
+ /* free_space is now the free space of a created new page */
+
+ /* Totals for the existing records of the page plus the tuple */
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = ulint(page_get_n_recs(page)) + 1;
+ ut_ad(total_n_recs >= 2);
+ total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+ /* n counts the included records and incl_data sums their sizes.
+ The walk starts from the infimum; ins_rec is the cursor record,
+ after which the tuple is counted. */
+ n = 0;
+ incl_data = 0;
+ ins_rec = btr_cur_get_rec(cursor);
+ rec = page_get_infimum_rec(page);
+
+ /* heap is passed to rec_get_offsets() by address below and
+ freed at func_exit if it was set. */
+ heap = NULL;
+ offsets = NULL;
+
+ /* We start to include records to the left half, and when the
+ space reserved by them exceeds half of total_space, then if
+ the included records fit on the left page, they will be put there
+ if something was left over also for the right page,
+ otherwise the last included record will be the first on the right
+ half page */
+
+ do {
+ /* Decide the next record to include */
+ if (rec == ins_rec) {
+ rec = NULL; /* NULL denotes that tuple is
+ now included */
+ } else if (rec == NULL) {
+ /* The tuple was included on the previous
+ iteration: continue after the cursor record. */
+ rec = page_rec_get_next(ins_rec);
+ } else {
+ rec = page_rec_get_next(rec);
+ }
+
+ if (rec == NULL) {
+ /* Include tuple */
+ incl_data += insert_size;
+ } else {
+ /* Include an existing record of the page */
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets, page_is_leaf(page)
+ ? cursor->index()
+ ->n_core_fields
+ : 0,
+ ULINT_UNDEFINED, &heap);
+ incl_data += rec_offs_size(offsets);
+ }
+
+ n++;
+ } while (incl_data + page_dir_calc_reserved_space(n)
+ < total_space / 2);
+
+ /* Here rec is the last included record, or NULL if the tuple
+ was the last one included. */
+
+ if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
+ /* The next record will be the first on
+ the right half page if it is not the
+ supremum record of page */
+
+ if (rec == ins_rec) {
+ /* The tuple would be next: return NULL. */
+ rec = NULL;
+
+ goto func_exit;
+ } else if (rec == NULL) {
+ next_rec = page_rec_get_next(ins_rec);
+ } else {
+ next_rec = page_rec_get_next(rec);
+ }
+ ut_ad(next_rec);
+ /* If next_rec is the supremum, rec is left unchanged,
+ so the last included record (or NULL for the tuple)
+ is returned. */
+ if (!page_rec_is_supremum(next_rec)) {
+ rec = next_rec;
+ }
+ }
+ /* Otherwise the included records do not fit in free_space, and
+ the last included record (or NULL for the tuple) is returned. */
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(rec);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Returns TRUE if the insert fits on the appropriate half-page with the
+chosen split_rec. The check subtracts, one record at a time, the records
+that would end up on the other half-page than the tuple, until the
+remaining data and page directory space fit in an empty page.
+@return true if fits */
+static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result))
+bool
+btr_page_insert_fits(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert
+ should be made */
+ const rec_t* split_rec,/*!< in: suggestion for first record
+ on upper half-page, or NULL if
+ tuple to be inserted should be first */
+ rec_offs** offsets,/*!< in: rec_get_offsets(
+ split_rec, cursor->index()); out: garbage */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mem_heap_t** heap) /*!< in: temporary memory heap */
+{
+ page_t* page;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ const rec_t* rec;
+ const rec_t* end_rec;
+
+ page = btr_cur_get_page(cursor);
+
+ /* When split_rec is given, *offsets must describe it. */
+ ut_ad(!split_rec
+ || !page_is_comp(page) == !rec_offs_comp(*offsets));
+ ut_ad(!split_rec
+ || rec_offs_validate(split_rec, cursor->index(), *offsets));
+
+ insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ /* free_space is now the free space of a created new page */
+ /* Note: no adjustment for a compressed page size is made in
+ this function. */
+
+ /* Start from the totals of the whole page plus the tuple. */
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = ulint(page_get_n_recs(page)) + 1;
+
+ /* We determine which records (from rec to end_rec, not including
+ end_rec) will end up on the other half page from tuple when it is
+ inserted. */
+
+ if (!(end_rec = split_rec)) {
+ /* No split_rec: the range ends at the successor of
+ the cursor record. */
+ end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
+ } else if (cmp_dtuple_rec(tuple, split_rec, cursor->index(),
+ *offsets) < 0) {
+ /* The tuple sorts before split_rec: the range is
+ from split_rec up to the supremum. */
+ rec = split_rec;
+ end_rec = page_get_supremum_rec(page);
+ goto got_rec;
+ }
+
+ /* In the remaining cases the range starts at the first record
+ after the infimum; if that cannot be found, report no fit. */
+ if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) {
+ return false;
+ }
+
+got_rec:
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ while (rec != end_rec) {
+ /* In this loop we calculate the amount of reserved
+ space after rec is removed from page. */
+
+ /* This overwrites the caller's *offsets, which is why
+ the parameter is documented as "out: garbage". */
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+ page_is_leaf(page)
+ ? cursor->index()->n_core_fields
+ : 0,
+ ULINT_UNDEFINED, heap);
+
+ total_data -= rec_offs_size(*offsets);
+ total_n_recs--;
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ /* Stop if the next record cannot be determined. */
+ if (!(rec = page_rec_get_next_const(rec))) {
+ break;
+ }
+ }
+
+ /* Even with the whole range removed the tuple does not fit. */
+ return(false);
+}
+#endif
+
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree.
+
+The insert position is searched on the given level, an optimistic insert
+is attempted, and if that returns DB_FAIL, a pessimistic insert is
+attempted.
+@return DB_SUCCESS, or the error code of the search or of the last
+attempted insert */
+dberr_t
+btr_insert_on_non_leaf_level(
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ big_rec_t* dummy_big_rec;
+ btr_cur_t cursor;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ /* Stack buffer for record offsets; the insert functions receive
+ &offsets and &heap and may replace offsets with heap memory. */
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ rtr_info_t rtr_info;
+
+ ut_ad(level > 0);
+
+ /* These flags are always added for an insert on a non-leaf level,
+ in addition to whatever the caller passed. */
+ flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG;
+ cursor.page_cur.index = index;
+
+ dberr_t err;
+
+ if (index->is_spatial()) {
+ /* For spatial index, initialize structures to track
+ its parents etc. */
+ rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
+
+ rtr_info_update_btr(&cursor, &rtr_info);
+ err = rtr_search_to_nth_level(level, tuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, mtr);
+ } else {
+ /* Position the cursor on the requested level, requesting
+ an X-latch. */
+ err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH,
+ &cursor, mtr);
+ }
+
+ ut_ad(cursor.flag == BTR_CUR_BINARY);
+ ut_ad(btr_cur_get_block(&cursor)
+ != mtr->at_savepoint(mtr->get_savepoint() - 1)
+ || index->is_spatial()
+ || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+ /* The insert is attempted only if the search succeeded. */
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ err = btr_cur_optimistic_insert(flags,
+ &cursor, &offsets, &heap,
+ tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ }
+
+ /* Fall back to the pessimistic insert on DB_FAIL. */
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(flags,
+ &cursor, &offsets, &heap,
+ tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ }
+
+ /* heap is set only if one of the calls above allocated it. */
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Release what rtr_init_rtr_info() set up for the spatial case. */
+ if (index->is_spatial()) {
+ ut_ad(cursor.rtr_info);
+
+ rtr_clean_rtr_info(&rtr_info, true);
+ }
+
+ return err;
+}
+
+/* The page number fields of the page header are compared with
+memcmp_aligned<4>() in btr_attach_half_pages(), which requires these
+offsets to be multiples of 4. */
+static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+
+MY_ATTRIBUTE((nonnull,warn_unused_result))
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree.
+
+If direction is FSP_DOWN, new_block becomes the lower half and the
+parent's node pointer to block is redirected to new_block; otherwise
+new_block becomes the upper half. A node pointer for the upper half is
+inserted on level + 1, and the FIL_PAGE_PREV/FIL_PAGE_NEXT links of the
+two halves and of the affected neighbour are updated.
+@return DB_SUCCESS, DB_CORRUPTION if the parent node pointer cannot be
+located or a sibling link does not point back to block, or the error
+code of btr_insert_on_non_leaf_level() */
+static
+dberr_t
+btr_attach_half_pages(
+/*==================*/
+	ulint		flags,		/*!< in: undo logging and
+					locking flags */
+	dict_index_t*	index,		/*!< in: the index tree */
+	buf_block_t*	block,		/*!< in/out: page to be split */
+	const rec_t*	split_rec,	/*!< in: first record on upper
+					half page */
+	buf_block_t*	new_block,	/*!< in/out: the new half page */
+	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dtuple_t*	node_ptr_upper;
+	mem_heap_t*	heap;
+	buf_block_t*	prev_block = nullptr;
+	buf_block_t*	next_block = nullptr;
+	buf_block_t*	lower_block;
+	buf_block_t*	upper_block;
+
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Create a memory heap where the data tuple is stored */
+	heap = mem_heap_create(1024);
+
+	/* Based on split direction, decide upper and lower pages */
+	if (direction == FSP_DOWN) {
+
+		btr_cur_t	cursor;
+		rec_offs*	offsets;
+
+		lower_block = new_block;
+		upper_block = block;
+
+		cursor.page_cur.block = block;
+		cursor.page_cur.index = index;
+
+		/* Look up the index for the node pointer to page */
+		offsets = btr_page_get_father_block(nullptr, heap, mtr,
+						    &cursor);
+
+		/* NOTE(review): btr_page_get_father_block() is defined
+		outside this section; it is assumed to return nullptr
+		when the node pointer cannot be found. Nothing has been
+		modified by this function yet, so fail cleanly instead
+		of passing null offsets on. */
+		if (UNIV_UNLIKELY(!offsets)) {
+			mem_heap_free(heap);
+			return DB_CORRUPTION;
+		}
+
+		/* Replace the address of the old child node (= page) with the
+		address of the new lower half */
+
+		btr_node_ptr_set_child_page_no(
+			btr_cur_get_block(&cursor),
+			btr_cur_get_rec(&cursor),
+			offsets, lower_block->page.id().page_no(), mtr);
+		mem_heap_empty(heap);
+	} else {
+		lower_block = block;
+		upper_block = new_block;
+	}
+
+	/* Get the level of the split pages */
+	const ulint level = btr_page_get_level(block->page.frame);
+	ut_ad(level == btr_page_get_level(new_block->page.frame));
+	page_id_t id{block->page.id()};
+
+	/* Get the previous and next pages of page */
+	const uint32_t prev_page_no = btr_page_get_prev(block->page.frame);
+	const uint32_t next_page_no = btr_page_get_next(block->page.frame);
+
+	/* for consistency, both blocks should be locked, before change */
+	/* Only the neighbour on the side of new_block needs updating:
+	the previous page for FSP_DOWN, the next page otherwise. */
+	if (prev_page_no != FIL_NULL && direction == FSP_DOWN) {
+		id.set_page_no(prev_page_no);
+		prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+		if (!prev_block) {
+			ut_ad(mtr->memo_contains(index->lock,
+						 MTR_MEMO_X_LOCK));
+			prev_block = btr_block_get(*index, prev_page_no,
+						   RW_X_LATCH, !level, mtr);
+		}
+#endif
+	}
+	if (next_page_no != FIL_NULL && direction != FSP_DOWN) {
+		id.set_page_no(next_page_no);
+		next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+		if (!next_block) {
+			ut_ad(mtr->memo_contains(index->lock,
+						 MTR_MEMO_X_LOCK));
+			next_block = btr_block_get(*index, next_page_no,
+						   RW_X_LATCH, !level, mtr);
+		}
+#endif
+	}
+
+	/* Build the node pointer (= node key and page address) for the upper
+	half */
+
+	node_ptr_upper = dict_index_build_node_ptr(
+		index, split_rec, upper_block->page.id().page_no(),
+		heap, level);
+
+	/* Insert it next to the pointer to the lower half. Note that this
+	may generate recursion leading to a split on the higher level. */
+
+	dberr_t err = btr_insert_on_non_leaf_level(
+		flags, index, level + 1, node_ptr_upper, mtr);
+
+	/* Free the memory heap */
+	mem_heap_free(heap);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		return err;
+	}
+
+	/* Update page links of the level */
+
+	if (prev_block) {
+		/* The previous page must point forward to block. */
+		if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame
+						    + FIL_PAGE_NEXT,
+						    block->page.frame
+						    + FIL_PAGE_OFFSET,
+						    4))) {
+			return DB_CORRUPTION;
+		}
+		btr_page_set_next(prev_block, lower_block->page.id().page_no(),
+				  mtr);
+	}
+
+	if (next_block) {
+		/* The next page must point back to block. */
+		if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame
+						    + FIL_PAGE_PREV,
+						    block->page.frame
+						    + FIL_PAGE_OFFSET,
+						    4))) {
+			return DB_CORRUPTION;
+		}
+		btr_page_set_prev(next_block, upper_block->page.id().page_no(),
+				  mtr);
+	}
+
+	/* Give new_block the outer link that block used to have on
+	that side; block keeps its other outer link unchanged. */
+	if (direction == FSP_DOWN) {
+		ut_ad(lower_block == new_block);
+		ut_ad(btr_page_get_next(upper_block->page.frame)
+		      == next_page_no);
+		btr_page_set_prev(lower_block, prev_page_no, mtr);
+	} else {
+		ut_ad(upper_block == new_block);
+		ut_ad(btr_page_get_prev(lower_block->page.frame)
+		      == prev_page_no);
+		btr_page_set_next(upper_block, next_page_no, mtr);
+	}
+
+	/* Finally link the two halves to each other. */
+	btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr);
+	btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr);
+
+	return DB_SUCCESS;
+}
+
+/*************************************************************//**
+Determine if a tuple is smaller than any record on the page, by
+comparing it with the first record after the page infimum.
+@return TRUE if smaller; false also if the first record cannot be read */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+btr_page_tuple_smaller(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in: b-tree cursor */
+	const dtuple_t*	tuple,	/*!< in: tuple to consider */
+	rec_offs**	offsets,/*!< in/out: temporary storage */
+	ulint		n_uniq,	/*!< in: number of unique fields
+				in the index page records */
+	mem_heap_t**	heap)	/*!< in/out: heap for offsets */
+{
+	buf_block_t* const	cur_block = btr_cur_get_block(cursor);
+	page_cur_t		first_cur;
+
+	/* Step from the infimum to the first user record of the page. */
+	page_cur_set_before_first(cur_block, &first_cur);
+	const rec_t* const	smallest = page_cur_move_to_next(&first_cur);
+
+	if (UNIV_UNLIKELY(!smallest)) {
+		ut_ad("corrupted page" == 0);
+		return false;
+	}
+
+	/* Compute the offsets of the first n_uniq fields for comparison. */
+	*offsets = rec_get_offsets(smallest, cursor->index(), *offsets,
+				   page_is_leaf(cur_block->page.frame)
+				   ? cursor->index()->n_core_fields : 0,
+				   n_uniq, heap);
+
+	const int	cmp = cmp_dtuple_rec(tuple, smallest,
+					     cursor->index(), *offsets);
+	return cmp < 0;
+}
+
+/** Insert the tuple into the right sibling page, if the cursor is at the end
+of a page.
+
+On success the node pointer of the right sibling is deleted from its parent
+and a new one, built from the inserted record, is inserted on the level above.
+@param[in] flags undo logging and locking flags
+@param[in,out] cursor cursor at which to insert; when the function succeeds,
+ the cursor is positioned before the insert point.
+@param[out] offsets offsets on inserted record
+@param[in,out] heap memory heap for allocating offsets
+@param[in] tuple tuple to insert
+@param[in] n_ext number of externally stored columns
+@param[in,out] mtr mini-transaction
+@return inserted record (first record on the right sibling page);
+ the cursor will be positioned on the page infimum
+@retval NULL if the operation was not performed */
+static
+rec_t*
+btr_insert_into_right_sibling(
+ ulint flags,
+ btr_cur_t* cursor,
+ rec_offs** offsets,
+ mem_heap_t* heap,
+ const dtuple_t* tuple,
+ ulint n_ext,
+ mtr_t* mtr)
+{
+ buf_block_t* block = btr_cur_get_block(cursor);
+ page_t* page = buf_block_get_frame(block);
+ const uint32_t next_page_no = btr_page_get_next(page);
+
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(heap);
+ ut_ad(dtuple_check_typed(tuple));
+
+ /* Nothing to do unless there is a right sibling and the cursor
+ record is immediately followed by the page supremum. */
+ if (next_page_no == FIL_NULL || !page_rec_is_supremum(
+ page_rec_get_next(btr_cur_get_rec(cursor)))) {
+
+ return nullptr;
+ }
+
+ page_cur_t next_page_cursor;
+ buf_block_t* next_block;
+ page_t* next_page;
+ btr_cur_t next_father_cursor;
+ rec_t* rec = nullptr;
+ ulint max_size;
+
+ /* Fetch the right sibling, requesting an X-latch. */
+ next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH,
+ page_is_leaf(page), mtr);
+ if (UNIV_UNLIKELY(!next_block)) {
+ return nullptr;
+ }
+ next_page = buf_block_get_frame(next_block);
+ const bool is_leaf = page_is_leaf(next_page);
+
+ next_page_cursor.index = cursor->index();
+ next_page_cursor.block = next_block;
+ next_father_cursor.page_cur = next_page_cursor;
+
+ /* Locate the parent of the right sibling; next_father_cursor is
+ used further below to delete its node pointer. */
+ if (!btr_page_get_father(mtr, &next_father_cursor)) {
+ return nullptr;
+ }
+
+ ulint up_match = 0, low_match = 0;
+
+ /* Position next_page_cursor for the tuple on the sibling page;
+ a nonzero return value is treated as failure. */
+ if (page_cur_search_with_match(tuple,
+ PAGE_CUR_LE, &up_match, &low_match,
+ &next_page_cursor, nullptr)) {
+ return nullptr;
+ }
+
+ /* Computed before the insert; passed to
+ ibuf_update_free_bits_if_full() at the end. */
+ max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
+
+ /* Extends gap lock for the next page */
+ if (is_leaf && cursor->index()->has_locking()) {
+ lock_update_node_pointer(block, next_block);
+ }
+
+ rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap,
+ n_ext, mtr);
+
+ if (!rec) {
+ if (is_leaf
+ && next_block->page.zip.ssize
+ && !dict_index_is_clust(cursor->index())
+ && !cursor->index()->table->is_temporary()) {
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing. */
+ ibuf_reset_free_bits(next_block);
+ }
+ return nullptr;
+ }
+
+ ibool compressed;
+ dberr_t err;
+ ulint level = btr_page_get_level(next_page);
+
+ /* adjust cursor position */
+ *btr_cur_get_page_cur(cursor) = next_page_cursor;
+
+ /* The tuple was inserted right after the infimum of next_page. */
+ ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
+ ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);
+
+ /* We have to change the parent node pointer */
+
+ compressed = btr_cur_pessimistic_delete(
+ &err, TRUE, &next_father_cursor,
+ BTR_CREATE_FLAG, false, mtr);
+
+ /* NOTE(review): from here on, nullptr is returned on failure
+ although the record has already been inserted into next_block;
+ confirm how the caller is expected to handle that. */
+ if (err != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&next_father_cursor, false, mtr);
+ }
+
+ /* Build a node pointer to next_block from the inserted record
+ and insert it on the level above. */
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ cursor->index(), rec, next_block->page.id().page_no(),
+ heap, level);
+
+ if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1,
+ node_ptr, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
+
+ if (is_leaf
+ && !dict_index_is_clust(cursor->index())
+ && !cursor->index()->table->is_temporary()) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ if (next_block->page.zip.ssize) {
+ ibuf_update_free_bits_zip(next_block, mtr);
+ } else {
+ ibuf_update_free_bits_if_full(
+ next_block, max_size,
+ rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ return(rec);
+}
+
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec. The records are first copied to new_block and then deleted
+from block.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return error code */
+static
+dberr_t
+page_move_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in: index page from where to move */
+	rec_t*		split_rec,	/*!< in: first record to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t* const	dest_page = buf_block_get_frame(new_block);
+
+	ut_ad(!dict_index_is_spatial(index));
+
+	/* Remember the state of the destination page before the copy,
+	so that the amount of moved data can be derived afterwards. */
+	const ulint	data_size_before = page_get_data_size(dest_page);
+	const ulint	n_recs_before = page_get_n_recs(dest_page);
+#ifdef UNIV_ZIP_DEBUG
+	{
+		page_zip_des_t*	dest_zip
+			= buf_block_get_page_zip(new_block);
+		page_zip_des_t*	src_zip
+			= buf_block_get_page_zip(block);
+		ut_a(!dest_zip == !src_zip);
+		ut_a(!dest_zip
+		     || page_zip_validate(dest_zip, dest_page, index));
+		ut_a(!src_zip
+		     || page_zip_validate(src_zip, page_align(split_rec),
+					  index));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	dberr_t	err;
+
+	if (!page_copy_rec_list_end(new_block, block,
+				    split_rec, index, mtr, &err)) {
+		/* The copy did not succeed: nothing is deleted. */
+		return err;
+	}
+
+	const ulint	data_size_after = page_get_data_size(dest_page);
+	const ulint	n_recs_after = page_get_n_recs(dest_page);
+
+	ut_ad(data_size_after >= data_size_before);
+
+	/* Delete the copied records from the source page, passing the
+	number and the total size of the records that were added to the
+	destination page. */
+	return page_delete_rec_list_end(split_rec, block, index,
+					n_recs_after - n_recs_before,
+					data_size_after - data_size_before,
+					mtr);
+}
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec. The records are first copied to new_block and, if the copy
+succeeded, deleted from block.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return error code */
+static
+dberr_t
+page_move_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in/out: page containing split_rec */
+	rec_t*		split_rec,	/*!< in: first record not to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dberr_t	err;
+
+	if (!page_copy_rec_list_start(new_block, block, split_rec,
+				      index, mtr, &err)) {
+		/* The copy did not succeed: leave block untouched. */
+		return err;
+	}
+
+	page_delete_rec_list_start(split_rec, block, index, mtr);
+	return err;
+}
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+@return inserted record or NULL if run out of space */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ buf_block_t* new_block;
+ page_t* new_page;
+ page_zip_des_t* new_page_zip;
+ rec_t* split_rec;
+ buf_block_t* left_block;
+ buf_block_t* right_block;
+ page_cur_t* page_cursor;
+ rec_t* first_rec;
+ byte* buf = 0; /* remove warning */
+ rec_t* move_limit;
+ ulint n_iterations = 0;
+ ulint n_uniq;
+
+ ut_ad(*err == DB_SUCCESS);
+ ut_ad(dtuple_check_typed(tuple));
+
+ buf_pool.pages_split++;
+
+ if (cursor->index()->is_spatial()) {
+ /* Split rtree page and update parent */
+ return rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr, err);
+ }
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index());
+func_start:
+ mem_heap_empty(*heap);
+ *offsets = NULL;
+
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(!dict_index_is_online_ddl(cursor->index())
+ || (flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(cursor->index()));
+ ut_ad(cursor->index()->lock.have_u_or_x());
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_is_empty(page));
+
+ /* try to insert to the next page if possible before split */
+ if (rec_t* rec = btr_insert_into_right_sibling(
+ flags, cursor, offsets, *heap, tuple, n_ext, mtr)) {
+ return(rec);
+ }
+
+ /* 1. Decide the split record; split_rec == NULL means that the
+ tuple to be inserted should be the first record on the upper
+ half-page */
+ bool insert_left = false;
+ uint32_t hint_page_no = block->page.id().page_no() + 1;
+ byte direction = FSP_UP;
+
+ if (n_iterations > 0) {
+ split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+
+ if (split_rec == NULL) {
+ insert_left = btr_page_tuple_smaller(
+ cursor, tuple, offsets, n_uniq, heap);
+ }
+ } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+ } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) {
+ direction = FSP_DOWN;
+ hint_page_no -= 2;
+ } else {
+ /* If there is only one record in the index page, we
+ can't split the node in the middle by default. We need
+ to determine whether the new record will be inserted
+ to the left or right. */
+
+ if (page_get_n_recs(page) > 1) {
+ split_rec = page_get_middle_rec(page);
+ } else if (btr_page_tuple_smaller(cursor, tuple,
+ offsets, n_uniq, heap)) {
+ split_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ } else {
+ split_rec = NULL;
+ goto got_split_rec;
+ }
+
+ if (UNIV_UNLIKELY(!split_rec)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+ }
+
+got_split_rec:
+ /* 2. Allocate a new page to the index */
+ const uint16_t page_level = btr_page_get_level(page);
+ new_block = btr_page_alloc(cursor->index(), hint_page_no, direction,
+ page_level, mtr, mtr, err);
+
+ if (!new_block) {
+ return nullptr;
+ }
+
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+
+ if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
+ /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
+ to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
+ memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4);
+ }
+ btr_page_create(new_block, new_page_zip, cursor->index(),
+ page_level, mtr);
+ /* Only record the leaf level page splits. */
+ if (!page_level) {
+ cursor->index()->stat_defrag_n_page_split ++;
+ cursor->index()->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index());
+ }
+
+ /* 3. Calculate the first record on the upper half-page, and the
+ first record (move_limit) on original page which ends up on the
+ upper half */
+
+ if (split_rec) {
+ first_rec = move_limit = split_rec;
+
+ *offsets = rec_get_offsets(split_rec, cursor->index(),
+ *offsets, page_is_leaf(page)
+ ? cursor->index()->n_core_fields
+ : 0,
+ n_uniq, heap);
+
+ insert_left = cmp_dtuple_rec(tuple, split_rec, cursor->index(),
+ *offsets) < 0;
+
+ if (!insert_left && new_page_zip && n_iterations > 0) {
+ /* If a compressed page has already been split,
+ avoid further splits by inserting the record
+ to an empty page. */
+ split_rec = NULL;
+ goto insert_empty;
+ }
+ } else if (insert_left) {
+ if (UNIV_UNLIKELY(!n_iterations)) {
+corrupted:
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+ first_rec = page_rec_get_next(page_get_infimum_rec(page));
+insert_move_limit:
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ if (UNIV_UNLIKELY(!first_rec || !move_limit)) {
+ goto corrupted;
+ }
+ } else {
+insert_empty:
+ ut_ad(!split_rec);
+ ut_ad(!insert_left);
+ buf = UT_NEW_ARRAY_NOKEY(
+ byte,
+ rec_get_converted_size(cursor->index(), tuple, n_ext));
+
+ first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(),
+ tuple, n_ext);
+ goto insert_move_limit;
+ }
+
+ /* 4. Do first the modifications in the tree structure */
+
+ /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */
+ *err = btr_attach_half_pages(flags, cursor->index(), block,
+ first_rec, new_block, direction, mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+
+#ifdef UNIV_DEBUG
+ /* If the split is made on the leaf level and the insert will fit
+ on the appropriate half-page, we may release the tree x-latch.
+ We can then move the records after releasing the tree latch,
+ thus reducing the tree latch contention. */
+ const bool insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, split_rec, offsets, tuple,
+ n_ext, heap);
+#endif
+ if (!split_rec && !insert_left) {
+ UT_DELETE_ARRAY(buf);
+ buf = NULL;
+ }
+
+#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled
+ if (insert_will_fit
+ && page_is_leaf(page)
+ && !dict_index_is_online_ddl(cursor->index())) {
+ mtr->release(cursor->index()->lock);
+ /* NOTE: We cannot release root block latch here, because it
+ has segment header and already modified in most of cases.*/
+ }
+#endif
+
+ /* 5. Move then the records to the new page */
+ if (direction == FSP_DOWN) {
+ /* fputs("Split left\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || (*err = page_move_rec_list_start(new_block, block,
+ move_limit,
+ cursor->index(),
+ mtr))) {
+ if (*err != DB_FAIL) {
+ return nullptr;
+ }
+
+ /* For some reason, compressing new_block failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_block, page_zip, page,
+ cursor->index(), mtr);
+ *err = page_delete_rec_list_end(move_limit
+ - page + new_page,
+ new_block,
+ cursor->index(),
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ /* Update the lock table and possible hash index. */
+ if (cursor->index()->has_locking()) {
+ lock_move_rec_list_start(
+ new_block, block, move_limit,
+ new_page + PAGE_NEW_INFIMUM);
+ }
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_start(move_limit, block,
+ cursor->index(), mtr);
+ }
+
+ left_block = new_block;
+ right_block = block;
+
+ if (cursor->index()->has_locking()) {
+ lock_update_split_left(right_block, left_block);
+ }
+ } else {
+ /* fputs("Split right\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || (*err = page_move_rec_list_end(new_block, block,
+ move_limit,
+ cursor->index(), mtr))) {
+ if (*err != DB_FAIL) {
+ return nullptr;
+ }
+
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_block, page_zip, page,
+ cursor->index(), mtr);
+ page_delete_rec_list_start(move_limit - page
+ + new_page, new_block,
+ cursor->index(), mtr);
+
+ /* Update the lock table and possible hash index. */
+ if (cursor->index()->has_locking()) {
+ lock_move_rec_list_end(new_block, block,
+ move_limit);
+ }
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block);
+
+ /* Delete the records from the source page. */
+
+ *err = page_delete_rec_list_end(move_limit, block,
+ cursor->index(),
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+ }
+
+ left_block = block;
+ right_block = new_block;
+
+ if (cursor->index()->has_locking()) {
+ lock_update_split_right(right_block, left_block);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (page_zip) {
+ ut_a(page_zip_validate(page_zip, page, cursor->index()));
+ ut_a(page_zip_validate(new_page_zip, new_page,
+ cursor->index()));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* At this point, split_rec, move_limit and first_rec may point
+ to garbage on the old page. */
+
+ /* 6. The split and the tree modification is now completed. Decide the
+ page where the tuple should be inserted */
+ rec_t* rec;
+ buf_block_t* const insert_block = insert_left
+ ? left_block : right_block;
+
+ /* 7. Reposition the cursor for insert and try insertion */
+ page_cursor = btr_cur_get_page_cur(cursor);
+ page_cursor->block = insert_block;
+
+ ulint up_match = 0, low_match = 0;
+
+ if (page_cur_search_with_match(tuple,
+ PAGE_CUR_LE, &up_match, &low_match,
+ page_cursor, nullptr)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ offsets, heap, n_ext, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_t* insert_page
+ = buf_block_get_frame(insert_block);
+
+ page_zip_des_t* insert_page_zip
+ = buf_block_get_page_zip(insert_block);
+
+ ut_a(!insert_page_zip
+ || page_zip_validate(insert_page_zip, insert_page,
+ cursor->index()));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (rec != NULL) {
+
+ goto func_exit;
+ }
+
+ /* 8. If insert did not fit, try page reorganization.
+ For compressed pages, page_cur_tuple_insert() will have
+ attempted this already. */
+
+ if (page_cur_get_page_zip(page_cursor)) {
+ goto insert_failed;
+ }
+
+ *err = btr_page_reorganize(page_cursor, mtr);
+
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ offsets, heap, n_ext, mtr);
+
+ if (rec == NULL) {
+ /* The insert did not fit on the page: loop back to the
+ start of the function for a new split */
+insert_failed:
+ /* We play safe and reset the free bits for new_page */
+ if (!dict_index_is_clust(page_cursor->index)
+ && !page_cursor->index->table->is_temporary()) {
+ ibuf_reset_free_bits(new_block);
+ ibuf_reset_free_bits(block);
+ }
+
+ n_iterations++;
+ ut_ad(n_iterations < 2
+ || buf_block_get_page_zip(insert_block));
+ ut_ad(!insert_will_fit);
+
+ goto func_start;
+ }
+
+func_exit:
+ /* Insert fit on the page: update the free bits for the
+ left and right pages in the same mtr */
+
+ if (!dict_index_is_clust(page_cursor->index)
+ && !page_cursor->index->table->is_temporary()
+ && page_is_leaf(page)) {
+
+ ibuf_update_free_bits_for_two_pages_low(
+ left_block, right_block, mtr);
+ }
+
+ ut_ad(page_validate(buf_block_get_frame(left_block),
+ page_cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(right_block),
+ page_cursor->index));
+
+ ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets));
+ return(rec);
+}
+
+/** Unlink a page from the doubly linked list of pages on its level.
+Each existing sibling is X-latched in the mini-transaction (unless it
+already is), and the sibling links are rewritten so that they bypass
+the page.
+@param[in]	block	page to remove
+@param[in]	index	index tree
+@param[in,out]	mtr	mini-transaction
+@return DB_SUCCESS, or the error reported by btr_block_get() when a
+sibling page could not be acquired */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(block.zip_size() == index.table->space->zip_size());
+  ut_ad(index.table->space->id == block.page.id().space());
+
+  /* Page numbers of the siblings, as stored in the page itself */
+  const uint32_t left_no= btr_page_get_prev(block.page.frame);
+  const uint32_t right_no= btr_page_get_next(block.page.frame);
+  /* Starts as the identifier of the page; only the page number is
+  replaced when addressing a sibling in the same tablespace. */
+  page_id_t sibling_id{block.page.id()};
+  buf_block_t *left= nullptr;
+  dberr_t err;
+
+  /* Step 1: make sure that the left sibling is X-latched. Its link is
+  written last, after the right sibling has been processed. */
+  if (left_no != FIL_NULL)
+  {
+    sibling_id.set_page_no(left_no);
+    left= mtr->get_already_latched(sibling_id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (left == nullptr)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      left= btr_block_get(index, sibling_id.page_no(), RW_X_LATCH,
+                          page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(left == nullptr))
+        return err;
+    }
+#endif
+  }
+
+  /* Step 2: latch the right sibling and point it back at the left one. */
+  if (right_no != FIL_NULL)
+  {
+    sibling_id.set_page_no(right_no);
+    buf_block_t *right=
+      mtr->get_already_latched(sibling_id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (right == nullptr)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      right= btr_block_get(index, sibling_id.page_no(), RW_X_LATCH,
+                           page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(right == nullptr))
+        return err;
+    }
+#endif
+    btr_page_set_prev(right, left_no, mtr);
+  }
+
+  /* Step 3: point the left sibling forward at the right one. */
+  if (left != nullptr)
+    btr_page_set_next(left, right_no, mtr);
+
+  return DB_SUCCESS;
+}
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+If the page is a leaf and its father is not the root, the father (which
+must then also be alone on its level) is lifted into its own father instead,
+and the given page is left in place.
+@return the block that holds the records of the given page afterwards:
+the father block, or the given block itself if the father was lifted
+@retval nullptr if copying the records failed; *err is the error code */
+buf_block_t*
+btr_lift_page_up(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code; NOTE(review): it is only
+ written by the record copy step below, so
+ callers presumably initialize it */
+{
+ buf_block_t* father_block;
+ ulint page_level;
+ page_zip_des_t* father_page_zip;
+ page_t* page = buf_block_get_frame(block);
+ ulint root_page_no;
+ buf_block_t* blocks[BTR_MAX_LEVELS];
+ ulint n_blocks; /*!< number of elements used in blocks[] */
+ ulint i;
+ bool lift_father_up;
+ buf_block_t* block_orig = block;
+
+ ut_ad(!page_has_siblings(page));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_is_empty(page));
+
+ page_level = btr_page_get_level(page);
+ root_page_no = dict_index_get_page(index);
+
+ /* Phase 1: locate the father of the page and collect every further
+ ancestor up to the root in blocks[]. */
+ {
+ btr_cur_t cursor;
+ rec_offs* offsets = NULL;
+ mem_heap_t* heap = mem_heap_create(
+ sizeof(*offsets)
+ * (REC_OFFS_HEADER_SIZE + 1 + 1
+ + unsigned(index->n_fields)));
+ buf_block_t* b;
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
+
+ if (index->is_spatial()) {
+ offsets = rtr_page_get_father_block(
+ nullptr, heap, mtr, nullptr, &cursor);
+ } else {
+ offsets = btr_page_get_father_block(offsets, heap,
+ mtr, &cursor);
+ }
+ father_block = btr_cur_get_block(&cursor);
+ father_page_zip = buf_block_get_page_zip(father_block);
+
+ n_blocks = 0;
+
+ /* Store all ancestor pages so we can reset their
+ levels later on. We have to do all the searches on
+ the tree now because later on, after we've replaced
+ the first level, the tree is in an inconsistent state
+ and can not be searched. */
+ for (b = father_block;
+ b->page.id().page_no() != root_page_no; ) {
+ ut_a(n_blocks < BTR_MAX_LEVELS);
+
+ if (index->is_spatial()) {
+ offsets = rtr_page_get_father_block(
+ nullptr, heap, mtr, nullptr, &cursor);
+ } else {
+ offsets = btr_page_get_father_block(offsets,
+ heap,
+ mtr,
+ &cursor);
+ }
+
+ /* The cursor has moved one level up; remember that block
+ and continue from it until the root is reached. */
+ blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+ }
+
+ /* n_blocks != 0 means that the father is not the root. */
+ lift_father_up = (n_blocks && page_level == 0);
+ if (lift_father_up) {
+ /* The father page also should be the only on its level (not
+ root). We should lift up the father page at first.
+ Because the leaf page should be lifted up only for root page.
+ The freeing page is based on page_level (==0 or !=0)
+ to choose segment. If the page_level is changed ==0 from !=0,
+ later freeing of the page doesn't find the page allocation
+ to be freed.*/
+
+ /* From here on, "block" is the father and "father_block"
+ is the grandfather; the original leaf is kept in
+ block_orig and is not modified. */
+ block = father_block;
+ page = buf_block_get_frame(block);
+ page_level = btr_page_get_level(page);
+
+ ut_ad(!page_has_siblings(page));
+ ut_ad(mtr->memo_contains_flagged(block,
+ MTR_MEMO_PAGE_X_FIX));
+
+ father_block = blocks[0];
+ father_page_zip = buf_block_get_page_zip(father_block);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ /* Phase 2: replace the contents of father_block with the records
+ of block. */
+ btr_search_drop_page_hash_index(block, false);
+
+ /* Make the father empty */
+ btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(father_block->page.frame));
+
+ if (index->is_instant()
+ && father_block->page.id().page_no() == root_page_no) {
+ ut_ad(!father_page_zip);
+
+ if (page_is_leaf(page)) {
+ const rec_t* rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ ut_ad(rec_is_metadata(rec, *index));
+ /* If the only record on the leaf is a metadata record
+ for which rec_is_add_metadata() holds, clear the
+ instant-add state of the index and skip the copy:
+ the root stays as emptied above. */
+ if (rec_is_add_metadata(rec, *index)
+ && page_get_n_recs(page) == 1) {
+ index->clear_instant_add();
+ goto copied;
+ }
+ }
+
+ btr_set_instant(father_block, *index, mtr);
+ }
+
+ /* Copy the records to the father page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(father_block, block,
+ page_get_infimum_rec(page),
+ index, mtr, err)) {
+ /* The record-by-record copy was skipped or did not succeed.
+ DB_FAIL is treated as recoverable (fall back to the
+ byte-for-byte copy below); any other error is returned. */
+ switch (*err) {
+ case DB_SUCCESS:
+ break;
+ case DB_FAIL:
+ *err = DB_SUCCESS;
+ break;
+ default:
+ return nullptr;
+ }
+
+ /* The fallback is only valid when both pages are
+ ROW_FORMAT=COMPRESSED pages (enforced by ut_a). */
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(father_page_zip);
+ ut_a(page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(father_block,
+ page_zip, page, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ if (index->has_locking()) {
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+ }
+
+ /* Also update the predicate locks */
+ if (dict_index_is_spatial(index)) {
+ lock_prdt_rec_move(father_block, block->page.id());
+ } else {
+ btr_search_move_or_delete_hash_entries(
+ father_block, block);
+ }
+ }
+
+copied:
+ if (index->has_locking()) {
+ const page_id_t id{block->page.id()};
+ /* Free predicate page locks on the block */
+ if (index->is_spatial()) {
+ lock_sys.prdt_page_free_from_discard(id);
+ } else {
+ lock_update_copy_and_discard(*father_block, id);
+ }
+ }
+
+ /* Phase 3: father_block already got level page_level from
+ btr_page_empty(); every remaining ancestor gets the level just
+ above the previous one, which is one less than it had. */
+ page_level++;
+
+ /* Go upward to root page, decrementing levels by one. */
+ /* When the father was lifted, blocks[0] is father_block itself and
+ is therefore skipped. */
+ for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+ ut_ad(btr_page_get_level(blocks[i]->page.frame)
+ == page_level + 1);
+ btr_page_set_level(blocks[i], page_level, mtr);
+ }
+
+ if (dict_index_is_spatial(index)) {
+ rtr_check_discard_page(index, NULL, block);
+ }
+
+ /* Free the file page */
+ /* NOTE(review): the return value of btr_page_free() is not checked
+ here, unlike in btr_compress(). */
+ btr_page_free(index, block, mtr);
+
+ /* We play it safe and reset the free bits for the father */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ ibuf_reset_free_bits(father_block);
+ }
+ ut_ad(page_validate(father_block->page.frame, index));
+ ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+ /* If the father was lifted, the caller's page still exists and
+ still holds its records. */
+ return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to the
+brothers, if they exist.
+@return error code
+@retval DB_SUCCESS if the page was merged to a brother or lifted up
+@retval DB_FAIL if the page could not be merged to either brother
+@retval DB_CORRUPTION if an inconsistency was detected */
+dberr_t
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index;
+ buf_block_t* merge_block = nullptr;
+ page_t* merge_page = nullptr;
+ page_zip_des_t* merge_page_zip;
+ ibool is_left;
+ buf_block_t* block;
+ page_t* page;
+ btr_cur_t father_cursor;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+ ulint nth_rec = 0; /* remove bogus warning */
+ bool mbr_changed = false;
+#ifdef UNIV_DEBUG
+ bool leftmost_child;
+#endif
+ DBUG_ENTER("btr_compress");
+
+ block = btr_cur_get_block(cursor);
+ page = btr_cur_get_page(cursor);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
+
+ const uint32_t left_page_no = btr_page_get_prev(page);
+ const uint32_t right_page_no = btr_page_get_next(page);
+ /* The one and only status variable of this function. Every error
+ path jumps to err_exit, which returns this value; it must therefore
+ never be shadowed by a declaration in an inner scope. */
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL
+ || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page))));
+
+ heap = mem_heap_create(100);
+ father_cursor.page_cur.index = index;
+ father_cursor.page_cur.block = block;
+
+ /* Position father_cursor on the node pointer to the page */
+ if (index->is_spatial()) {
+ offsets = rtr_page_get_father_block(
+ NULL, heap, mtr, cursor, &father_cursor);
+ ut_ad(cursor->page_cur.block->page.id() == block->page.id());
+ rec_t* my_rec = father_cursor.page_cur.rec;
+
+ ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
+
+ if (page_no != block->page.id().page_no()) {
+ ib::info() << "father positioned on page "
+ << page_no << " instead of "
+ << block->page.id().page_no();
+ offsets = btr_page_get_father_block(
+ NULL, heap, mtr, &father_cursor);
+ }
+ } else {
+ offsets = btr_page_get_father_block(
+ NULL, heap, mtr, &father_cursor);
+ }
+
+ if (adjust) {
+ /* Remember the ordinal position of the cursor record, so
+ that the cursor can be repositioned on the merged page. */
+ nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+ if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) {
+ corrupted:
+ err = DB_CORRUPTION;
+ err_exit:
+ /* We play it safe and reset the free bits. */
+ if (merge_block && merge_block->zip_size()
+ && page_is_leaf(merge_block->page.frame)
+ && !index->is_clust()) {
+ ibuf_reset_free_bits(merge_block);
+ }
+ goto func_exit;
+ }
+ }
+
+ if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
+ /* The page is the only one on the level, lift the records
+ to the father */
+
+ merge_block = btr_lift_page_up(index, block, mtr, &err);
+
+ if (UNIV_UNLIKELY(!merge_block)) {
+ /* btr_lift_page_up() failed and reported the reason
+ in err. There is no merged page to reposition the
+ cursor on, so merge_block must not be dereferenced. */
+ ut_ad(err != DB_SUCCESS);
+ goto err_exit;
+ }
+success:
+ if (adjust) {
+ ut_ad(nth_rec > 0);
+ if (rec_t* nth
+ = page_rec_get_nth(merge_block->page.frame,
+ nth_rec)) {
+ btr_cur_position(index, nth,
+ merge_block, cursor);
+ } else {
+ goto corrupted;
+ }
+ }
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
+func_exit:
+ mem_heap_free(heap);
+ DBUG_RETURN(err);
+ }
+
+ ut_d(leftmost_child =
+ left_page_no != FIL_NULL
+ && (page_rec_get_next(
+ page_get_infimum_rec(
+ btr_cur_get_page(&father_cursor)))
+ == btr_cur_get_rec(&father_cursor)));
+
+ /* Decide the page to which we try to merge and which will inherit
+ the locks */
+
+ is_left = btr_can_merge_with_page(cursor, left_page_no,
+ &merge_block, mtr);
+
+ DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);
+retry:
+ if (!is_left
+ && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
+ mtr)) {
+ if (!merge_block) {
+ merge_page = NULL;
+ }
+cannot_merge:
+ err = DB_FAIL;
+ goto err_exit;
+ }
+
+ merge_page = buf_block_get_frame(merge_block);
+
+ /* The link of the merge page that points towards our page must
+ contain the page number of our page. */
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left
+ ? FIL_PAGE_NEXT
+ : FIL_PAGE_PREV),
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ goto corrupted;
+ }
+
+ ut_ad(page_validate(merge_page, index));
+
+ merge_page_zip = buf_block_get_page_zip(merge_block);
+#ifdef UNIV_ZIP_DEBUG
+ if (merge_page_zip) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+ ut_a(page_zip_validate(merge_page_zip, merge_page, index));
+ ut_a(page_zip_validate(page_zip, page, index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ btr_cur_t cursor2;
+ cursor2.page_cur.index = index;
+ cursor2.page_cur.block = merge_block;
+
+ /* Move records to the merge page */
+ if (is_left) {
+ rtr_mbr_t new_mbr;
+ rec_offs* offsets2 = NULL;
+
+ /* For rtree, we need to update father's mbr. */
+ if (index->is_spatial()) {
+ /* We only support merge pages with the same parent
+ page */
+ if (!rtr_check_same_block(
+ index, &cursor2,
+ btr_cur_get_block(&father_cursor), heap)) {
+ is_left = false;
+ goto retry;
+ }
+
+ /* Set rtr_info for cursor2, since it is
+ necessary in recursive page merge. */
+ cursor2.rtr_info = cursor->rtr_info;
+ cursor2.tree_height = cursor->tree_height;
+
+ offsets2 = rec_get_offsets(
+ btr_cur_get_rec(&cursor2), index, NULL,
+ page_is_leaf(btr_cur_get_page(&cursor2))
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Check if parent entry needs to be updated */
+ mbr_changed = rtr_merge_mbr_changed(
+ &cursor2, &father_cursor,
+ offsets2, offsets, &new_mbr);
+ }
+
+ rec_t* orig_pred = page_copy_rec_list_start(
+ merge_block, block, page_get_supremum_rec(page),
+ index, mtr, &err);
+
+ if (!orig_pred) {
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block, false);
+
+ /* Remove the page from the level list */
+ err = btr_level_list_remove(*block, *index, mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
+
+ const page_id_t id{block->page.id()};
+
+ if (index->is_spatial()) {
+ rec_t* my_rec = father_cursor.page_cur.rec;
+
+ ulint page_no = btr_node_ptr_get_child_page_no(
+ my_rec, offsets);
+
+ if (page_no != block->page.id().page_no()) {
+ ib::fatal() << "father positioned on "
+ << page_no << " instead of "
+ << block->page.id().page_no();
+ }
+
+ if (mbr_changed) {
+ rtr_update_mbr_field(
+ &cursor2, offsets2, &father_cursor,
+ merge_page, &new_mbr, NULL, mtr);
+ } else {
+ rtr_node_ptr_delete(&father_cursor, mtr);
+ }
+
+ /* There are no gap locks to worry about */
+ lock_sys.prdt_page_free_from_discard(id);
+ } else {
+ err = btr_cur_node_ptr_delete(&father_cursor, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
+ if (index->has_locking()) {
+ lock_update_merge_left(
+ *merge_block, orig_pred, id);
+ }
+ }
+
+ if (adjust) {
+ /* The records of the page were appended after
+ orig_pred on the merge page; shift the remembered
+ position accordingly. */
+ ulint n = page_rec_get_n_recs_before(orig_pred);
+ if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) {
+ goto corrupted;
+ }
+ nth_rec += n;
+ }
+ } else {
+ rec_t* orig_succ;
+ ibool compressed;
+ /* NOTE: do not declare a local "err" in this scope. The
+ error paths below jump to err_exit, which returns the
+ function-level err; a shadowing variable would make a
+ failed merge be reported as DB_SUCCESS. */
+ byte fil_page_prev[4];
+
+ if (index->is_spatial()) {
+ /* For spatial index, we disallow merge of blocks
+ with different parents, since the merge would need
+ to update entry (for MBR and Primary key) in the
+ parent of block being merged */
+ if (!rtr_check_same_block(
+ index, &cursor2,
+ btr_cur_get_block(&father_cursor), heap)) {
+ goto cannot_merge;
+ }
+
+ /* Set rtr_info for cursor2, since it is
+ necessary in recursive page merge. */
+ cursor2.rtr_info = cursor->rtr_info;
+ cursor2.tree_height = cursor->tree_height;
+ } else if (!btr_page_get_father(mtr, &cursor2)) {
+ goto cannot_merge;
+ }
+
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* The function page_zip_compress(), which will be
+ invoked by page_copy_rec_list_end() below,
+ requires that FIL_PAGE_PREV be FIL_NULL.
+ Clear the field, but prepare to restore it. */
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+ memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+ compile_time_assert(FIL_NULL == 0xffffffffU);
+ memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
+ }
+
+ orig_succ = page_copy_rec_list_end(merge_block, block,
+ page_get_infimum_rec(page),
+ cursor->index(), mtr, &err);
+
+ if (!orig_succ) {
+ ut_a(merge_page_zip);
+ if (left_page_no == FIL_NULL) {
+ /* FIL_PAGE_PREV was restored from
+ merge_page_zip. */
+ ut_ad(!memcmp(fil_page_prev,
+ merge_page + FIL_PAGE_PREV, 4));
+ }
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block, false);
+
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* Restore FIL_PAGE_PREV in order to avoid an assertion
+ failure in btr_level_list_remove(), which will set
+ the field again to FIL_NULL. Even though this makes
+ merge_page and merge_page_zip inconsistent for a
+ split second, it is harmless, because the pages
+ are X-latched. */
+ memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
+ }
+
+ /* Remove the page from the level list */
+ err = btr_level_list_remove(*block, *index, mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
+
+ ut_ad(btr_node_ptr_get_child_page_no(
+ btr_cur_get_rec(&father_cursor), offsets)
+ == block->page.id().page_no());
+
+ /* Replace the address of the old child node (= page) with the
+ address of the merge page to the right */
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_block(&father_cursor),
+ btr_cur_get_rec(&father_cursor),
+ offsets, right_page_no, mtr);
+
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
+ ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(
+ buf_block_get_frame(merge_block))),
+ page_is_comp(page)));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* For rtree, we need to update father's mbr. */
+ if (index->is_spatial()) {
+ rec_offs* offsets2;
+ ulint rec_info;
+
+ offsets2 = rec_get_offsets(
+ btr_cur_get_rec(&cursor2), index, NULL,
+ page_is_leaf(btr_cur_get_page(&cursor2))
+ ? index->n_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(btr_node_ptr_get_child_page_no(
+ btr_cur_get_rec(&cursor2), offsets2)
+ == right_page_no);
+
+ rec_info = rec_get_info_bits(
+ btr_cur_get_rec(&father_cursor),
+ rec_offs_comp(offsets));
+ if (rec_info & REC_INFO_MIN_REC_FLAG) {
+ /* When the father node ptr is minimal rec,
+ we will keep it and delete the node ptr of
+ merge page. */
+ rtr_merge_and_update_mbr(&father_cursor,
+ &cursor2,
+ offsets, offsets2,
+ merge_page, mtr);
+ } else {
+ /* Otherwise, we will keep the node ptr of
+ merge page and delete the father node ptr.
+ This is for keeping the rec order in upper
+ level. */
+ rtr_merge_and_update_mbr(&cursor2,
+ &father_cursor,
+ offsets2, offsets,
+ merge_page, mtr);
+ }
+ const page_id_t id{block->page.id()};
+ lock_sys.prdt_page_free_from_discard(id);
+ } else {
+
+ /* Delete the now redundant node pointer that cursor2
+ is positioned on; the father's node pointer was
+ redirected to the merge page above. */
+ compressed = btr_cur_pessimistic_delete(&err, TRUE,
+ &cursor2,
+ BTR_CREATE_FLAG,
+ false, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&cursor2, false,
+ mtr);
+ }
+
+ if (index->has_locking()) {
+ lock_update_merge_right(
+ merge_block, orig_succ, block);
+ }
+ }
+ }
+
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(merge_page)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. This has to be done in a
+ separate mini-transaction that is committed before the
+ main mini-transaction. We cannot update the insert
+ buffer bitmap in this mini-transaction, because
+ btr_compress() can be invoked recursively without
+ committing the mini-transaction in between. Since
+ insert buffer bitmap pages have a lower rank than
+ B-tree pages, we must not access other pages in the
+ same mini-transaction after accessing an insert buffer
+ bitmap page. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (merge_block->zip_size()) {
+ /* Because the free bits may be incremented
+ and we cannot update the insert buffer bitmap
+ in the same mini-transaction, the only safe
+ thing we can do here is the pessimistic
+ approach: reset the free bits. */
+ ibuf_reset_free_bits(merge_block);
+ } else {
+ /* On uncompressed pages, the free bits will
+ never increase here. Thus, it is safe to
+ write the bits accurately in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(merge_block,
+ srv_page_size,
+ ULINT_UNDEFINED);
+ }
+ }
+
+ ut_ad(page_validate(merge_page, index));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (dict_index_is_spatial(index)) {
+ rtr_check_discard_page(index, NULL, block);
+ }
+
+ /* Free the file page */
+ err = btr_page_free(index, block, mtr);
+ if (err == DB_SUCCESS) {
+ ut_ad(leftmost_child
+ || btr_check_node_ptr(index, merge_block, mtr));
+ goto success;
+ } else {
+ goto err_exit;
+ }
+}
+
+/*************************************************************//**
+Discards a page that is the only page on its level. This will empty
+the whole B-tree, leaving just an empty root page. This function
+should almost never be reached, because btr_compress(), which is invoked in
+delete operations, calls btr_lift_page_up() to flatten the B-tree.
+
+The function first walks from the given page up to the root, freeing
+each page on the way. Every page visited must hold exactly one record
+and have no siblings (enforced with ut_a()). If looking up a father page
+or freeing a page fails, the function returns early; being void, it
+cannot report that failure to the caller.
+
+Once the root is reached, it is re-initialized by btr_page_empty() as an
+empty level-0 page. For the clustered index, the copy of the metadata
+record that was saved before emptying (if any) is inserted again;
+otherwise index->clear_instant_add() is invoked when index->is_instant()
+holds. For a secondary index of a non-temporary table,
+ibuf_reset_free_bits() is invoked and the PAGE_MAX_TRX_ID that was read
+from the original page is written to the root. */
+ATTRIBUTE_COLD
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_level = 0; /* level that block is expected to have; verified by ut_a() in the loop */
+
+ ut_ad(!index->is_dummy);
+
+ /* Save the PAGE_MAX_TRX_ID from the leaf page. */
+ const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame);
+ const rec_t* r = page_rec_get_next(
+ page_get_infimum_rec(block->page.frame));
+ /* In the caller we checked that a valid key exists in the page,
+ because we were able to look up a parent page. */
+ ut_ad(r);
+ ut_ad(rec_is_metadata(r, *index) == index->is_instant());
+
+ while (block->page.id().page_no() != dict_index_get_page(index)) { /* climb until block is the root page */
+ btr_cur_t cursor;
+ buf_block_t* father;
+ const page_t* page = buf_block_get_frame(block);
+
+ ut_a(page_get_n_recs(page) == 1);
+ ut_a(page_level == btr_page_get_level(page));
+ ut_a(!page_has_siblings(page));
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(block->page.id().space() == index->table->space->id);
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ btr_search_drop_page_hash_index(block, false);
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
+
+ if (index->is_spatial()) {
+ /* Check any concurrent search having this page */
+ rtr_check_discard_page(index, NULL, block);
+ if (!rtr_page_get_father(mtr, nullptr, &cursor)) {
+ return; /* father not found: give up silently */
+ }
+ } else {
+ if (!btr_page_get_father(mtr, &cursor)) {
+ return; /* father not found: give up silently */
+ }
+ }
+ father = btr_cur_get_block(&cursor);
+
+ if (index->has_locking()) {
+ lock_update_discard(
+ father, PAGE_HEAP_NO_SUPREMUM, block);
+ }
+
+ /* Free the file page */
+ if (btr_page_free(index, block, mtr) != DB_SUCCESS) {
+ return;
+ }
+
+ block = father; /* continue one level up */
+ page_level++;
+ }
+
+ /* block is the root page, which must be empty, except
+ for the node pointer to the (now discarded) block(s). */
+ ut_ad(!page_has_siblings(block->page.frame));
+
+ mem_heap_t* heap = nullptr;
+ const rec_t* rec = nullptr; /* heap copy of r, if r has to survive btr_page_empty() */
+ rec_offs* offsets = nullptr;
+ if (index->table->instant || index->must_avoid_clear_instant_add()) {
+ if (!rec_is_metadata(r, *index)) { /* no metadata record: nothing to preserve */
+ } else if (!index->table->instant
+ || rec_is_alter_metadata(r, *index)) {
+ heap = mem_heap_create(srv_page_size);
+ offsets = rec_get_offsets(r, index, nullptr,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ rec = rec_copy(mem_heap_alloc(heap,
+ rec_offs_size(offsets)),
+ r, offsets);
+ rec_offs_make_valid(rec, index, true, offsets);
+ }
+ }
+
+ btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); /* re-create the root as an empty level-0 page */
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+ /* btr_page_empty() is supposed to zero-initialize the field. */
+ ut_ad(!page_get_instant(block->page.frame));
+
+ if (index->is_primary()) {
+ if (rec) {
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+ cur.index = index;
+ DBUG_ASSERT(index->table->instant); /* NOTE(review): the copy above is also taken when !index->table->instant; confirm that case cannot reach here */
+ DBUG_ASSERT(rec_is_alter_metadata(rec, *index));
+ btr_set_instant(block, *index, mtr);
+ rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr);
+ ut_ad(rec);
+ mem_heap_free(heap); /* NOTE(review): heap is released only on this path; presumably a metadata record implies the clustered index - confirm */
+ } else if (index->is_instant()) {
+ index->clear_instant_add();
+ }
+ } else if (!index->table->is_temporary()) {
+ /* We play it safe and reset the free bits for the root */
+ ibuf_reset_free_bits(block);
+
+ ut_a(max_trx_id);
+ page_set_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ max_trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty.
+
+A sibling ("merge block") is chosen first: the left sibling if there is
+one, otherwise the right sibling. If the page has no sibling at all, the
+work is delegated to btr_discard_only_page_on_level() and DB_SUCCESS is
+returned. Otherwise the node pointer to the page is deleted from the
+father, the page is unlinked from its level list, lock_update_discard()
+is invoked for the merge block (when index->has_locking()), and the page
+is freed. Finally, if the father is the root and is left with a single
+record and no siblings, btr_lift_page_up() is invoked on the merge block.
+@return DB_SUCCESS, or an error code: DB_CORRUPTION if the father node
+pointer cannot be found, if the sibling is on a different level, or if a
+non-leaf right sibling has no first record; otherwise the error reported
+by btr_block_reget(), btr_cur_node_ptr_delete(), btr_level_list_remove(),
+btr_page_free() or btr_lift_page_up() */
+dberr_t
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ buf_block_t* merge_block;
+ buf_block_t* block;
+ btr_cur_t parent_cursor;
+
+ block = btr_cur_get_block(cursor);
+ index = btr_cur_get_index(cursor);
+ parent_cursor.page_cur = cursor->page_cur; /* seed for the father lookup below */
+
+ ut_ad(dict_index_get_page(index) != block->page.id().page_no());
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+ MONITOR_INC(MONITOR_INDEX_DISCARD);
+
+ if (index->is_spatial()
+ ? !rtr_page_get_father(mtr, cursor, &parent_cursor)
+ : !btr_page_get_father(mtr, &parent_cursor)) {
+ return DB_CORRUPTION;
+ }
+
+ /* Decide the page which will inherit the locks */
+
+ const uint32_t left_page_no = btr_page_get_prev(block->page.frame);
+ const uint32_t right_page_no = btr_page_get_next(block->page.frame);
+ page_id_t merge_page_id{block->page.id()};
+
+ ut_d(bool parent_is_different = false);
+ dberr_t err;
+ if (left_page_no != FIL_NULL) { /* prefer the left sibling */
+ merge_page_id.set_page_no(left_page_no);
+ merge_block = btr_block_reget(mtr, *index, merge_page_id,
+ &err);
+ if (UNIV_UNLIKELY(!merge_block)) {
+ return err;
+ }
+#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */
+ ut_ad(!memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame + FIL_PAGE_OFFSET,
+ 4));
+#else
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
+#endif
+ ut_d(parent_is_different =
+ (page_rec_get_next(
+ page_get_infimum_rec(
+ btr_cur_get_page(
+ &parent_cursor)))
+ == btr_cur_get_rec(&parent_cursor)));
+ } else if (right_page_no != FIL_NULL) { /* no left sibling: use the right one */
+ merge_page_id.set_page_no(right_page_no);
+ merge_block = btr_block_reget(mtr, *index, merge_page_id,
+ &err);
+ if (UNIV_UNLIKELY(!merge_block)) {
+ return err;
+ }
+#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */
+ ut_ad(!memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame + FIL_PAGE_OFFSET,
+ 4));
+#else
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
+#endif
+ ut_d(parent_is_different = page_rec_is_supremum(
+ page_rec_get_next(btr_cur_get_rec(&parent_cursor))));
+ if (page_is_leaf(merge_block->page.frame)) { /* nothing to mark on a leaf page */
+ } else if (rec_t* node_ptr =
+ page_rec_get_next(page_get_infimum_rec(
+ merge_block->page.frame))) {
+ ut_ad(page_rec_is_user_rec(node_ptr));
+ /* We have to mark the leftmost node pointer as the
+ predefined minimum record. */
+ btr_set_min_rec_mark<true>(node_ptr, *merge_block,
+ mtr);
+ } else {
+ return DB_CORRUPTION;
+ }
+ } else { /* the page has no siblings on its level */
+ btr_discard_only_page_on_level(index, block, mtr);
+ return DB_SUCCESS;
+ }
+
+ if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame
+ [PAGE_HEADER + PAGE_LEVEL],
+ &block->page.frame
+ [PAGE_HEADER + PAGE_LEVEL], 2))) { /* sibling must be on the same level */
+ return DB_CORRUPTION;
+ }
+
+ btr_search_drop_page_hash_index(block, false);
+
+ if (dict_index_is_spatial(index)) {
+ rtr_node_ptr_delete(&parent_cursor, mtr);
+ } else if (dberr_t err =
+ btr_cur_node_ptr_delete(&parent_cursor, mtr)) {
+ return err;
+ }
+
+ /* Remove the page from the level list */
+ if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) {
+ return err;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (page_zip_des_t* merge_page_zip
+ = buf_block_get_page_zip(merge_block))
+ ut_a(page_zip_validate(merge_page_zip,
+ merge_block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (index->has_locking()) {
+ if (left_page_no != FIL_NULL) {
+ lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
+ block);
+ } else {
+ lock_update_discard(merge_block,
+ lock_get_min_heap_no(merge_block),
+ block);
+ }
+
+ if (index->is_spatial()) { /* NOTE(review): only reached when index->has_locking(); confirm that is intended */
+ rtr_check_discard_page(index, cursor, block);
+ }
+ }
+
+ /* Free the file page */
+ err = btr_page_free(index, block, mtr);
+
+ if (err == DB_SUCCESS) {
+ /* btr_check_node_ptr() needs parent block latched.
+ If the merge_block's parent block is not same,
+ we cannot use btr_check_node_ptr() */
+ ut_ad(parent_is_different
+ || btr_check_node_ptr(index, merge_block, mtr));
+
+ if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
+ == index->page
+ && !page_has_siblings(btr_cur_get_page(&parent_cursor))
+ && page_get_n_recs(btr_cur_get_page(&parent_cursor))
+ == 1) { /* the root is left with a single node pointer */
+ btr_lift_page_up(index, merge_block, mtr, &err);
+ }
+ }
+
+ return err;
+}
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree.
+Writes the output of fseg_print() for the non-leaf and the leaf page
+segments, whose headers are stored in the index root page, to stderr.
+Nothing but a notice is printed for the change buffer (ibuf) tree, or
+when the root page cannot be read. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+{
+  if (dict_index_is_ibuf(index)) {
+    fputs("Sorry, cannot print info of an ibuf tree:"
+          " use ibuf functions\n", stderr);
+    return;
+  }
+
+  mtr_t mtr;
+  mtr_start(&mtr);
+
+  /* The three-argument form is the one btr_validate_index() uses.
+  The root may be unavailable, so it must not be dereferenced
+  before it has been checked. */
+  dberr_t err;
+  if (page_t *root = btr_root_get(index, &mtr, &err)) {
+    fseg_header_t *seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+    fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
+    fseg_print(seg, &mtr);
+
+    /* The ibuf tree was excluded above, so the leaf segment
+    is always printed here. */
+    seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+    fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+    fseg_print(seg, &mtr);
+  } else {
+    fputs("InnoDB: cannot read the index root page\n", stderr);
+  }
+
+  mtr_commit(&mtr);
+}
+
+/************************************************************//**
+Prints recursively index tree pages.
+The page itself is printed with page_print(). On a non-leaf page, the
+child pages of the node pointers near the start and the end of the page
+(as selected by width) are then visited, each in a mini-transaction of
+its own. A child that cannot be obtained is skipped. */
+static
+void
+btr_print_recursive(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ ulint width, /*!< in: print this many entries from start
+ and end */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+  const page_t* page = buf_block_get_frame(block);
+  page_cur_t cursor;
+  ulint n_recs;
+  ulint i = 0;
+  mtr_t mtr2;
+
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX));
+
+  /* page.id is an accessor function in this code base; it has to be
+  called to obtain the page identifier that is printed. */
+  ib::info() << "NODE ON LEVEL " << btr_page_get_level(page)
+             << " page " << block->page.id();
+
+  page_print(block, index, width, width);
+
+  n_recs = page_get_n_recs(page);
+
+  page_cur_set_before_first(block, &cursor);
+  page_cur_move_to_next(&cursor);
+
+  while (!page_cur_is_after_last(&cursor)) {
+
+    if (page_is_leaf(page)) {
+
+      /* If this is the leaf level, do nothing */
+
+    } else if ((i <= width) || (i >= n_recs - width)) {
+
+      const rec_t* node_ptr;
+
+      mtr_start(&mtr2);
+
+      node_ptr = page_cur_get_rec(&cursor);
+
+      *offsets = rec_get_offsets(
+              node_ptr, index, *offsets, 0,
+              ULINT_UNDEFINED, heap);
+      if (buf_block_t *child =
+          btr_node_ptr_get_child(node_ptr, index, *offsets,
+                                 &mtr2)) {
+        btr_print_recursive(index, child, width, heap,
+                            offsets, &mtr2);
+      }
+      mtr_commit(&mtr2);
+    }
+
+    page_cur_move_to_next(&cursor);
+    i++;
+  }
+}
+
+/**************************************************************//**
+Prints directories and other info of all nodes in the tree.
+The tree is printed starting from the root page. Nothing but a notice
+is printed when the root page cannot be read. In debug builds, the index
+is validated afterwards and must be reported as DB_SUCCESS. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+{
+  mem_heap_t* heap = NULL;
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs* offsets = offsets_;
+  rec_offs_init(offsets_);
+
+  fputs("--------------------------\n"
+        "INDEX TREE PRINT\n", stderr);
+
+  mtr_t mtr;
+  mtr_start(&mtr);
+
+  /* Same call form and null check as in btr_validate_level():
+  the root block may be unavailable. */
+  dberr_t err;
+  if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+                                             &mtr, &err)) {
+    btr_print_recursive(index, root, width, &heap, &offsets,
+                        &mtr);
+    if (heap) {
+      mem_heap_free(heap);
+    }
+  } else {
+    fputs("InnoDB: cannot read the index root page\n", stderr);
+  }
+
+  mtr_commit(&mtr);
+
+  /* btr_validate_index() returns a dberr_t, and success is falsy
+  wherever this file tests a dberr_t in a condition. Compare
+  explicitly, so that the assertion holds for a valid index. */
+  ut_ad(btr_validate_index(index, nullptr) == DB_SUCCESS);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Debug check of the node pointer that refers to a page.
+The root page has no node pointer and passes trivially. For any other
+page the father node pointer is looked up. For a non-leaf page, a node
+pointer tuple built from the first record on the page must in addition
+match the father record (for a spatial index, under PAGE_CUR_WITHIN).
+A mismatch is a failed assertion, not a return value.
+@return TRUE */
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+  btr_cur_t father_cur;
+  page_t *frame= buf_block_get_frame(block);
+
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+  if (block->page.id().page_no() == dict_index_get_page(index))
+  {
+    /* The root page is not referenced by any node pointer. */
+    return TRUE;
+  }
+
+  father_cur.page_cur.index= index;
+  father_cur.page_cur.block= block;
+
+  mem_heap_t *heap= mem_heap_create(256);
+  const bool spatial= dict_index_is_spatial(index);
+  rec_offs *offsets;
+
+  if (spatial)
+    offsets= rtr_page_get_father_block(NULL, heap, mtr, NULL, &father_cur);
+  else
+    offsets= btr_page_get_father_block(NULL, heap, mtr, &father_cur);
+
+  ut_ad(offsets);
+
+  if (!page_is_leaf(frame))
+  {
+    const rec_t *first_rec= page_rec_get_next(page_get_infimum_rec(frame));
+    dtuple_t *tuple= dict_index_build_node_ptr(index, first_rec, 0, heap,
+                                               btr_page_get_level(frame));
+
+    /* For spatial index, the MBR in the parent rec could be different
+    with that of first rec of child, their relationship should be
+    "WITHIN" relationship */
+    if (spatial)
+      ut_a(!cmp_dtuple_rec_with_gis(tuple, btr_cur_get_rec(&father_cur),
+                                    PAGE_CUR_WITHIN));
+    else
+      ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&father_cur), index,
+                           offsets));
+  }
+
+  mem_heap_free(heap);
+  return TRUE;
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Logs an informational message that identifies a record: the index and
+table names, the identifier of the page that holds the record, and the
+offset of the record within that page. */
+static
+void
+btr_index_rec_validate_report(
+/*==========================*/
+ const page_t* page, /*!< in: index page */
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index) /*!< in: index */
+{
+  const page_id_t rec_page_id(page_get_space_id(page),
+                              page_get_page_no(page));
+
+  ib::info() << "Record in index " << index->name
+             << " of table " << index->table->name
+             << ", page " << rec_page_id
+             << ", at offset " << page_offset(rec);
+}
+
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+
+Records of the change buffer (ibuf) index are accepted without checks.
+Otherwise the following is verified, in this order:
+(1) the compact-format flag of the page agrees with the table;
+(2) when the record is the first user record of a leaf page without a
+predecessor in the clustered index of a table with index->table->instant
+set, it must be an ALTER TABLE metadata record;
+(3) on a page that is not in compact format, the number of fields
+stored in the record is acceptable for the index;
+(4) the length of each field agrees with the fixed size of its column,
+or does not exceed the prefix length of the index field.
+Each failure is logged; with dump_on_error the record is printed too.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+{
+ ulint len;
+ const page_t* page;
+ mem_heap_t* heap = NULL; /* only created if rec_get_offsets() needs more room than offsets_ */
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = page_align(rec);
+
+ ut_ad(index->n_core_fields);
+
+ if (index->is_ibuf()) {
+ /* The insert buffer index tree can contain records from any
+ other index: we cannot check the number of fields or
+ their length */
+
+ return(TRUE);
+ }
+
+#ifdef VIRTUAL_INDEX_DEBUG
+ if (dict_index_has_virtual(index)) {
+ fprintf(stderr, "index name is %s\n", index->name());
+ }
+#endif
+ if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "Compact flag=" << !!page_is_comp(page)
+ << ", should be " << dict_table_is_comp(index->table);
+
+ return(FALSE);
+ }
+
+ const bool is_alter_metadata = page_is_leaf(page)
+ && !page_has_prev(page)
+ && index->is_primary() && index->table->instant
+ && rec == page_rec_get_next_const(page_get_infimum_rec(page)); /* i.e. rec is in the position where the metadata record is expected */
+
+ if (is_alter_metadata
+ && !rec_is_alter_metadata(rec, page_is_comp(page))) {
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "First record is not ALTER TABLE metadata";
+ return FALSE;
+ }
+
+ if (!page_is_comp(page)) {
+ const ulint n_rec_fields = rec_get_n_fields_old(rec);
+ if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD
+ && index->id == DICT_INDEXES_ID) {
+ /* A record for older SYS_INDEXES table
+ (missing merge_threshold column) is acceptable. */
+ } else if (is_alter_metadata) {
+ if (n_rec_fields != ulint(index->n_fields) + 1) { /* the metadata record carries one extra field */
+ goto n_field_mismatch;
+ }
+ } else if (n_rec_fields < index->n_core_fields
+ || n_rec_fields > index->n_fields) {
+n_field_mismatch:
+ btr_index_rec_validate_report(page, rec, index);
+
+ ib::error() << "Has " << rec_get_n_fields_old(rec)
+ << " fields, should have "
+ << index->n_core_fields << ".."
+ << index->n_fields;
+
+ if (dump_on_error) {
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_old(stderr, rec);
+ putc('\n', stderr);
+ }
+ return(FALSE); /* heap is still NULL here: nothing to free */
+ }
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ const dict_field_t* field = index->fields; /* advanced at next_field; not advanced for the extra metadata field */
+ ut_ad(rec_offs_n_fields(offsets)
+ == ulint(index->n_fields) + is_alter_metadata);
+
+ for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) {
+ rec_get_nth_field_offs(offsets, i, &len);
+
+ ulint fixed_size;
+
+ if (is_alter_metadata && i == index->first_user_field()) {
+ fixed_size = FIELD_REF_SIZE; /* the extra field must be an externally stored field reference */
+ if (len != FIELD_REF_SIZE
+ || !rec_offs_nth_extern(offsets, i)) {
+ goto len_mismatch;
+ }
+
+ continue; /* skips the field++ at next_field on purpose */
+ } else {
+ fixed_size = dict_col_get_fixed_size(
+ field->col, page_is_comp(page));
+ if (rec_offs_nth_extern(offsets, i)) {
+ const byte* data = rec_get_nth_field(
+ rec, offsets, i, &len);
+ len -= BTR_EXTERN_FIELD_REF_SIZE; /* length of the locally stored prefix */
+ ulint extern_len = mach_read_from_4(
+ data + len + BTR_EXTERN_LEN + 4);
+ if (fixed_size == extern_len + len) {
+ goto next_field;
+ }
+ }
+ }
+
+ /* Note that if fixed_size != 0, it equals the
+ length of a fixed-size column in the clustered index.
+ We should adjust it here.
+ A prefix index of the column is of fixed, but different
+ length. When fixed_size == 0, prefix_len is the maximum
+ length of the prefix index column. */
+
+ if (len_is_stored(len)
+ && (field->prefix_len
+ ? len > field->prefix_len
+ : (fixed_size && len != fixed_size))) {
+len_mismatch:
+ btr_index_rec_validate_report(page, rec, index);
+ ib::error error;
+
+ error << "Field " << i << " len is " << len
+ << ", should be " << fixed_size;
+
+ if (dump_on_error) {
+ error << "; ";
+ rec_print(error.m_oss, rec,
+ rec_get_info_bits(
+ rec, rec_offs_comp(offsets)),
+ offsets);
+ }
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(FALSE);
+ }
+next_field:
+ field++;
+ }
+
+#ifdef VIRTUAL_INDEX_DEBUG
+ if (dict_index_has_virtual(index)) {
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/************************************************************//**
+Validates every user record of an index page against the definition of
+the index, by calling btr_index_rec_validate() with error dumping on.
+@return true if the end of the page was reached with all records valid;
+false if a record failed validation or page_cur_move_to_next() did not
+return a record */
+static
+bool
+btr_index_page_validate(
+/*====================*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index) /*!< in: index */
+{
+  page_cur_t pos;
+#ifndef DBUG_OFF
+  ulint expected_nth= 1;
+#endif /* !DBUG_OFF */
+
+  page_cur_set_before_first(block, &pos);
+
+  /* Directory slot 0 should only contain the infimum record. */
+  DBUG_EXECUTE_IF("check_table_rec_next",
+                  ut_a(page_rec_get_nth_const(page_cur_get_page(&pos), 0)
+                       == pos.rec);
+                  ut_a(page_dir_slot_get_n_owned(
+                         page_dir_get_nth_slot(page_cur_get_page(&pos), 0))
+                       == 1););
+
+  for (;;)
+  {
+    if (!page_cur_move_to_next(&pos))
+      return false;
+
+    if (page_cur_is_after_last(&pos))
+      return true;
+
+    if (!btr_index_rec_validate(pos.rec, index, TRUE))
+      return false;
+
+    /* Verify that page_rec_get_nth_const() is correctly
+    retrieving each record. */
+    DBUG_EXECUTE_IF("check_table_rec_next",
+                    ut_a(pos.rec == page_rec_get_nth_const(
+                           page_cur_get_page(&pos),
+                           page_rec_get_n_recs_before(pos.rec)));
+                    ut_a(expected_nth++
+                         == page_rec_get_n_recs_before(pos.rec)););
+  }
+}
+
+/************************************************************//**
+Reports an error that concerns one page of an index tree. The message
+names the page number, the index and the table, and also the tree level
+unless the level is 0. */
+static
+void
+btr_validate_report1(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block) /*!< in: index page */
+{
+  ib::error msg;
+
+  msg << "In page " << block->page.id().page_no();
+  msg << " of index " << index->name;
+  msg << " of table " << index->table->name;
+
+  if (level)
+    msg << ", index tree level " << level;
+}
+
+/************************************************************//**
+Reports an error that concerns two pages of an index tree. The message
+names both page identifiers, the index and the table, and also the tree
+level unless the level is 0. */
+static
+void
+btr_validate_report2(
+/*=================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block1, /*!< in: first index page */
+ const buf_block_t* block2) /*!< in: second index page */
+{
+  ib::error msg;
+
+  msg << "In pages " << block1->page.id();
+  msg << " and " << block2->page.id();
+  msg << " of index " << index->name;
+  msg << " of table " << index->table->name;
+
+  if (level > 0) {
+    msg << ", index tree level " << level;
+  }
+}
+
+/** Validate an index tree level.
+
+Phase 1 descends from the root along the first node pointer of each
+page until a page on the requested level is reached (for a spatial
+index, the level list is additionally followed to the left until its
+first page).
+Phase 2 walks the level list to the right. On each page it checks that
+the page is allocated, belongs to this index, passes page_validate(),
+is on the expected level and (on level 0) holds well-formed records;
+that the links to the right sibling are consistent; that the records of
+adjacent pages are in ascending order (B-tree only); and that the node
+pointers in the father level refer to the page correctly (B-tree only,
+not for the root). The mini-transaction is committed and restarted
+between pages. The walk stops early if trx_is_interrupted(trx) holds.
+@return the value of err when the walk ended: DB_CORRUPTION or another
+error code set by a failed check or page read */
+static
+dberr_t
+btr_validate_level(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ const trx_t* trx, /*!< in: transaction or NULL */
+ ulint level) /*!< in: level number */
+{
+ buf_block_t* block;
+ page_t* page;
+ buf_block_t* right_block = 0; /* remove warning */
+ page_t* right_page = 0; /* remove warning */
+ page_t* father_page;
+ btr_cur_t node_cur;
+ btr_cur_t right_node_cur;
+ rec_t* rec;
+ page_cur_t cursor;
+ dtuple_t* node_ptr_tuple;
+ mtr_t mtr;
+ mem_heap_t* heap = mem_heap_create(256); /* released at func_exit */
+ rec_offs* offsets = NULL;
+ rec_offs* offsets2= NULL;
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip;
+#endif /* UNIV_ZIP_DEBUG */
+
+ mtr.start();
+
+ mtr_x_lock_index(index, &mtr);
+
+ dberr_t err;
+ block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err);
+ if (!block) {
+ mtr.commit();
+ return err; /* NOTE(review): heap is not freed on this path - confirm whether that is a leak */
+ }
+ page = buf_block_get_frame(block);
+
+ fil_space_t* space = index->table->space;
+
+ while (level != btr_page_get_level(page)) { /* phase 1: descend via the first node pointer of each page */
+ const rec_t* node_ptr;
+ switch (dberr_t e =
+ fseg_page_is_allocated(space,
+ block->page.id().page_no())) {
+ case DB_SUCCESS_LOCKED_REC: /* the expected outcome for an allocated page */
+ break;
+ case DB_SUCCESS:
+ btr_validate_report1(index, level, block);
+ ib::warn() << "Page is free";
+ e = DB_CORRUPTION;
+ /* fall through */
+ default:
+ err = e; /* NOTE(review): the descent continues, and err is passed to btr_node_ptr_get_child() below */
+ }
+ ut_ad(index->table->space_id == block->page.id().space());
+ ut_ad(block->page.id().space() == page_get_space_id(page));
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ if (page_is_leaf(page)) { /* reached a leaf without finding the requested level */
+corrupted:
+ err = DB_CORRUPTION;
+ goto invalid_page;
+ }
+
+ page_cur_set_before_first(block, &cursor);
+ if (!(node_ptr = page_cur_move_to_next(&cursor))) {
+ goto corrupted;
+ }
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr,
+ &err);
+ if (!block) {
+ break; /* handled at the loop label below */
+ }
+ page = buf_block_get_frame(block);
+
+ /* For R-Tree, since record order might not be the same as
+ linked index page in the lower level, we need to traverse
+ backwards to get the first page rec in this level.
+ This is only used for index validation. Spatial index
+ does not use such scan for any of its DML or query
+ operations */
+ if (dict_index_is_spatial(index)) {
+ uint32_t left_page_no = btr_page_get_prev(page);
+
+ while (left_page_no != FIL_NULL) {
+ /* To obey latch order of tree blocks,
+ we should release the right_block once to
+ obtain lock of the uncle block. */
+ mtr.release_last_page();
+
+ block = btr_block_get(*index, left_page_no,
+ RW_SX_LATCH, false,
+ &mtr, &err);
+ if (!block) {
+ goto invalid_page;
+ }
+ page = buf_block_get_frame(block);
+ left_page_no = btr_page_get_prev(page);
+ }
+ }
+ }
+
+ /* Now we are on the desired level. Loop through the pages on that
+ level. */
+
+loop:
+ if (!block) {
+invalid_page:
+ mtr.commit();
+func_exit:
+ mem_heap_free(heap);
+ return err;
+ }
+
+ mem_heap_empty(heap);
+ offsets = offsets2 = NULL; /* the old offsets lived in the heap that was just emptied */
+
+ mtr_x_lock_index(index, &mtr);
+
+ page = block->page.frame;
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (DB_SUCCESS_LOCKED_REC
+ != fseg_page_is_allocated(space, block->page.id().page_no())) {
+ btr_validate_report1(index, level, block);
+
+ ib::warn() << "Page is marked as free";
+ err = DB_CORRUPTION;
+ } else if (btr_page_get_index_id(page) != index->id) {
+ ib::error() << "Page index id " << btr_page_get_index_id(page)
+ << " != data dictionary index id " << index->id;
+ err = DB_CORRUPTION;
+ } else if (!page_validate(page, index)) {
+ btr_validate_report1(index, level, block);
+ err = DB_CORRUPTION;
+ } else if (btr_page_get_level(page) != level) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Page level is not " << level;
+ err = DB_CORRUPTION;
+ } else if (level == 0 && !btr_index_page_validate(block, index)) {
+ /* We are on level 0. Check that the records have the right
+ number of fields, and field lengths are right. */
+ err = DB_CORRUPTION;
+ } else if (!page_is_empty(page)) { /* nonempty page: the remaining branches do not apply */
+ } else if (level) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Non-leaf page is empty"; /* NOTE(review): reported, but err is left unchanged */
+ } else if (block->page.id().page_no() != index->page) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Empty leaf page is not index root"; /* NOTE(review): reported, but err is left unchanged */
+ }
+
+ uint32_t right_page_no = btr_page_get_next(page);
+ uint32_t left_page_no = btr_page_get_prev(page);
+
+ if (right_page_no != FIL_NULL) {
+ const rec_t* right_rec;
+
+ right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+ !level, &mtr, &err);
+ if (!right_block) {
+ btr_validate_report1(index, level, block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr);
+ goto invalid_page;
+ }
+ right_page = buf_block_get_frame(right_block);
+
+ if (btr_page_get_prev(right_page) != page_get_page_no(page)) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT"
+ " or FIL_PAGE_PREV links\n", stderr);
+ err = DB_CORRUPTION;
+ }
+
+ if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) { /* rec: last record of this page */
+broken_links:
+ btr_validate_report1(index, level, block);
+ fputs("InnoDB: broken record links\n", stderr);
+ goto invalid_page;
+ }
+ if (!(right_rec =
+ page_rec_get_next(page_get_infimum_rec(right_page)))) { /* right_rec: first record of the right sibling */
+ goto broken_links;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ offsets2 = rec_get_offsets(right_rec, index, offsets2,
+ page_is_leaf(right_page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* For spatial index, we cannot guarantee the key ordering
+ across pages, so skip the record compare verification for
+ now. This will be enhanced in a special R-Tree index
+ validation scheme */
+ if (index->is_btree()
+ && cmp_rec_rec(rec, right_rec,
+ offsets, offsets2, index) >= 0) {
+
+ btr_validate_report2(index, level, block, right_block);
+
+ fputs("InnoDB: records in wrong order"
+ " on adjacent pages\n", stderr);
+
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ if (rec) {
+ fputs("InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ }
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(right_page));
+ if (rec) {
+ rec_print(stderr, rec, index);
+ }
+ putc('\n', stderr);
+ err = DB_CORRUPTION;
+ }
+ }
+
+ if (!level || left_page_no != FIL_NULL) { /* only the leftmost page of a non-leaf level is checked below */
+ } else if (const rec_t* first =
+ page_rec_get_next_const(page_get_infimum_rec(page))) {
+ if (!(REC_INFO_MIN_REC_FLAG
+ & rec_get_info_bits(first, page_is_comp(page)))) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Missing REC_INFO_MIN_REC_FLAG";
+ err = DB_CORRUPTION;
+ }
+ } else {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+
+ /* Similarly skip the father node check for spatial index for now,
+ for a couple of reasons:
+ 1) As mentioned, there is no ordering relationship between records
+ in parent level and linked pages in the child level.
+ 2) Search parent from root is very costly for R-tree.
+ We will add special validation mechanism for R-tree later (WL #7520) */
+ if (index->is_btree() && block->page.id().page_no() != index->page) {
+ /* Check father node pointers */
+ rec_t* node_ptr
+ = page_rec_get_next(page_get_infimum_rec(page));
+ if (!node_ptr) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+
+ btr_cur_position(index, node_ptr, block, &node_cur);
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &node_cur, &mtr);
+
+ father_page = btr_cur_get_page(&node_cur);
+ node_ptr = btr_cur_get_rec(&node_cur); /* from here on, node_ptr is the father record found via the first record */
+
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ if (rec) {
+ btr_cur_position(index, rec, block, &node_cur);
+
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &node_cur, &mtr);
+ } else {
+ offsets = nullptr;
+ }
+
+ if (!offsets || node_ptr != btr_cur_get_rec(&node_cur)
+ || btr_node_ptr_get_child_page_no(node_ptr, offsets)
+ != block->page.id().page_no()) { /* the first and the last record must lead to the same father record, which must point to this page */
+
+ btr_validate_report1(index, level, block);
+
+ fputs("InnoDB: node pointer to the page is wrong\n",
+ stderr);
+
+ fputs("InnoDB: node ptr ", stderr);
+ rec_print(stderr, node_ptr, index);
+
+ if (offsets) {
+ rec = btr_cur_get_rec(&node_cur);
+ fprintf(stderr, "\n"
+ "InnoDB: node ptr child page n:o %u\n",
+ btr_node_ptr_get_child_page_no(
+ rec, offsets));
+ fputs("InnoDB: record on page ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+
+ if (page_is_leaf(page)) { /* the key comparison below applies to non-leaf pages only */
+ } else if (const rec_t* first_rec =
+ page_rec_get_next(page_get_infimum_rec(page))) {
+ node_ptr_tuple = dict_index_build_node_ptr(
+ index, first_rec,
+ 0, heap, btr_page_get_level(page));
+
+ if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, index,
+ offsets)) {
+ btr_validate_report1(index, level, block);
+
+ ib::error() << "Node ptrs differ on levels > 0";
+
+ fputs("InnoDB: node ptr ",stderr);
+ rec_print_new(stderr, node_ptr, offsets);
+ fputs("InnoDB: first rec ", stderr);
+ rec_print(stderr, first_rec, index);
+ putc('\n', stderr);
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+ } else {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+
+ if (left_page_no == FIL_NULL) { /* leftmost page: its node pointer must be first on the leftmost father */
+ if (page_has_prev(father_page)
+ || node_ptr != page_rec_get_next(
+ page_get_infimum_rec(father_page))) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+ }
+
+ if (right_page_no == FIL_NULL) { /* rightmost page: its node pointer must be last on the rightmost father */
+ if (page_has_next(father_page)
+ || node_ptr != page_rec_get_prev(
+ page_get_supremum_rec(father_page))) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
+ } else if (const rec_t* right_node_ptr
+ = page_rec_get_next(node_ptr)) {
+ btr_cur_position(
+ index,
+ page_get_infimum_rec(right_block->page.frame),
+ right_block, &right_node_cur);
+ if (!page_cur_move_to_next(&right_node_cur.page_cur)) {
+ goto node_pointer_corrupted;
+ }
+
+ offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets, heap, &right_node_cur, &mtr);
+
+ if (right_node_ptr
+ != page_get_supremum_rec(father_page)) { /* the right sibling is expected to share this father page */
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != right_node_ptr) {
+node_pointer_corrupted:
+ err = DB_CORRUPTION;
+ fputs("InnoDB: node pointer to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+ } else { /* the right sibling is expected under the next father page */
+ page_t* right_father_page
+ = btr_cur_get_page(&right_node_cur);
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != page_rec_get_next(
+ page_get_infimum_rec(
+ right_father_page))) {
+ err = DB_CORRUPTION;
+ fputs("InnoDB: node pointer 2 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+
+ if (page_get_page_no(right_father_page)
+ != btr_page_get_next(father_page)) {
+
+ err = DB_CORRUPTION;
+ fputs("InnoDB: node pointer 3 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+ }
+ }
+ } else {
+ err = DB_CORRUPTION;
+ }
+ }
+
+node_ptr_fails:
+ /* Commit the mini-transaction to release the latch on 'page'.
+ Re-acquire the latch on right_page, which will become 'page'
+ on the next loop. The page has already been checked.
+ NOTE(review): the btr_block_get() call below is given &err. If it
+ stores a success code on a successful read, a DB_CORRUPTION that was
+ recorded for the current page would be overwritten - confirm. */
+ mtr.commit();
+
+ if (trx_is_interrupted(trx)) {
+ /* On interrupt, return the current status. */
+ } else if (right_page_no != FIL_NULL) {
+
+ mtr.start();
+
+ block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
+ !level, &mtr, &err);
+ goto loop;
+ }
+
+ goto func_exit;
+}
+
+/**************************************************************//**
+Checks the consistency of an index tree, one level at a time, from the
+level of the root page down to level 0. All levels are visited even
+after a failure; the error of the last failing level is returned.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or NULL */
+{
+  mtr_t mtr;
+  mtr.start();
+
+  mtr_x_lock_index(index, &mtr);
+
+  dberr_t err;
+  page_t *root= btr_root_get(index, &mtr, &err);
+
+  if (root)
+  {
+    auto level= btr_page_get_level(root);
+
+    do
+    {
+      const dberr_t level_err= btr_validate_level(index, trx, level);
+      if (level_err)
+        err= level_err;
+    }
+    while (level--); /* the pass for level 0 is the last one */
+  }
+
+  mtr.commit();
+  return err;
+}
+
+/**************************************************************//**
+Checks whether the records of the cursor page would fit on a given
+sibling page, reorganizing the sibling if that is what it takes.
+The answer is "no" when there is no sibling (FIL_NULL), when the sibling
+cannot be obtained, when the data would not fit even after a
+reorganization, when a compressed leaf sibling would reach the size
+returned by dict_index_zip_pad_optimal_page_size(), or when the
+reorganization fails.
+@return true if possible to merge; *merge_block is then the sibling
+block, and NULL otherwise */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ uint32_t page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+  buf_block_t *usable= NULL;
+  DBUG_ENTER("btr_can_merge_with_page");
+
+  if (page_no != FIL_NULL)
+  {
+    dict_index_t *index= btr_cur_get_index(cursor);
+    page_t *page= btr_cur_get_page(cursor);
+
+    if (buf_block_t *mblock= btr_block_get(*index, page_no, RW_X_LATCH,
+                                           page_is_leaf(page), mtr))
+    {
+      page_t *mpage= buf_block_get_frame(mblock);
+      const ulint n_recs= page_get_n_recs(page);
+      const ulint data_size= page_get_data_size(page);
+      const ulint room_after_reorg=
+        page_get_max_insert_size_after_reorganize(mpage, n_recs);
+
+      if (data_size > room_after_reorg)
+      {
+        /* Would not fit even on a reorganized sibling. */
+      }
+      else if (mblock->page.zip.data && page_is_leaf(mpage)
+               && (page_get_data_size(mpage) + data_size
+                   >= dict_index_zip_pad_optimal_page_size(index)))
+      {
+        /* If compression padding tells us that merging will result in
+        too packed up page i.e.: which is likely to cause compression
+        failure then don't merge the pages. */
+      }
+      else if (data_size <= page_get_max_insert_size(mpage, n_recs))
+      {
+        usable= mblock;
+      }
+      else if (btr_page_reorganize_block(page_zip_level, mblock, index,
+                                         mtr) == DB_SUCCESS)
+      {
+        /* The sibling had to be reorganized; measure it again. */
+        const ulint room= page_get_max_insert_size(mpage, n_recs);
+
+        ut_ad(page_validate(mpage, index));
+        ut_ad(room == room_after_reorg);
+
+        /* Add fault tolerance, though a lack of room should
+        never happen here */
+        if (data_size <= room)
+        {
+          usable= mblock;
+        }
+      }
+    }
+  }
+
+  *merge_block= usable;
+  DBUG_RETURN(usable != NULL);
+}
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
new file mode 100644
index 00000000..013cd131
--- /dev/null
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -0,0 +1,1233 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0bulk.cc
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*******************************************************/
+
+#include "btr0bulk.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "ibuf0ibuf.h"
+#include "page0page.h"
+#include "trx0trx.h"
+
+/** Innodb B-tree index fill factor for bulk load. */
+uint innobase_fill_factor;
+
+/** Initialize members, allocate page if needed and start mtr.
+When m_page_no is FIL_NULL, a new page is allocated in a separate
+mini-transaction; otherwise the existing page m_page_no is X-latched.
+Note: we commit all mtrs on failure. The extent reservation made for a
+page allocation is released whether or not the allocation succeeds.
+@return error code. */
+dberr_t
+PageBulk::init()
+{
+  buf_block_t* new_block;
+  page_t* new_page;
+
+  ut_ad(m_heap == NULL);
+  m_heap = mem_heap_create(1000);
+
+  m_mtr.start();
+  m_index->set_modified(m_mtr);
+
+  if (m_page_no == FIL_NULL) {
+    mtr_t alloc_mtr;
+
+    /* We commit redo log for allocation by a separate mtr,
+    because we don't guarantee pages are committed following
+    the allocation order, and we will always generate redo log
+    for page allocation, even when creating a new tablespace. */
+    alloc_mtr.start();
+    m_index->set_modified(alloc_mtr);
+
+    uint32_t n_reserved;
+    dberr_t err = fsp_reserve_free_extents(
+      &n_reserved, m_index->table->space, 1, FSP_NORMAL,
+      &alloc_mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+oom:
+      alloc_mtr.commit();
+      m_mtr.commit();
+      return err;
+    }
+
+    /* Allocate a new page. */
+    new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level,
+                               &alloc_mtr, &m_mtr, &err);
+
+    /* The reservation has served its purpose once the allocation
+    was attempted. Release it on the failure path as well, so that
+    the extents reserved above are not left reserved. */
+    m_index->table->space->release_free_extents(n_reserved);
+
+    if (!new_block) {
+      goto oom;
+    }
+
+    alloc_mtr.commit();
+
+    new_page = buf_block_get_frame(new_block);
+    m_page_no = new_block->page.id().page_no();
+
+    byte* index_id = my_assume_aligned<2>
+      (PAGE_HEADER + PAGE_INDEX_ID + new_page);
+    /* FIL_PAGE_PREV and FIL_PAGE_NEXT are adjacent, so a single
+    8-byte fill sets both of them to FIL_NULL. */
+    compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+    compile_time_assert(FIL_NULL == 0xffffffff);
+    memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8);
+
+    if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+      mach_write_to_8(index_id, m_index->id);
+      page_create_zip(new_block, m_index, m_level, 0,
+                      &m_mtr);
+    } else {
+      ut_ad(!m_index->is_spatial());
+      page_create(new_block, &m_mtr,
+                  m_index->table->not_redundant());
+      m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff);
+      m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+                                      + PAGE_LEVEL
+                                      + new_page, m_level);
+      m_mtr.write<8>(*new_block, index_id, m_index->id);
+    }
+  } else {
+    new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH,
+                              false, &m_mtr);
+    if (!new_block) {
+      m_mtr.commit();
+      return(DB_CORRUPTION);
+    }
+
+    new_page = buf_block_get_frame(new_block);
+
+    ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
+
+    btr_page_set_level(new_block, m_level, &m_mtr);
+  }
+
+  m_page_zip = buf_block_get_page_zip(new_block);
+
+  if (!m_level && dict_index_is_sec_or_ibuf(m_index)) {
+    page_update_max_trx_id(new_block, m_page_zip, m_trx_id,
+                           &m_mtr);
+  }
+
+  m_block = new_block;
+  m_page = new_page;
+  m_cur_rec = page_get_infimum_rec(new_page);
+  ut_ad(m_is_comp == !!page_is_comp(new_page));
+  m_free_space = page_get_free_space_of_empty(m_is_comp);
+
+  if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) {
+    /* Keep default behavior compatible with 5.6 */
+    m_reserved_space = dict_index_get_space_reserve();
+  } else {
+    m_reserved_space =
+      srv_page_size * (100 - innobase_fill_factor) / 100;
+  }
+
+  m_padding_space =
+    srv_page_size - dict_index_zip_pad_optimal_page_size(m_index);
+  m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP);
+  m_rec_no = page_header_get_field(new_page, PAGE_N_RECS);
+  /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0,
+  without writing redo log, to ensure that needs_finish() will hold
+  on an empty page. */
+  ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION);
+  m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0;
+  ut_d(m_total_data = 0);
+
+  return(DB_SUCCESS);
+}
+
+/** Insert a record in the page.
+The record is appended at m_heap_top and linked after m_cur_rec.
+For ROW_FORMAT=COMPRESSED the bytes are written directly to the page
+without redo logging; for the other formats the writes go through m_mtr,
+reusing bytes of the preceding record where they match.
+For the non-COMPRESSED formats, header fields are modified in the buffer
+that rec points to before it is copied to the page.
+@tparam fmt the page format
+@param[in,out] rec record
+@param[in] offsets record offsets */
+template<PageBulk::format fmt>
+inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets)
+{
+ ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+ ut_ad((fmt != REDUNDANT) == m_is_comp);
+ ut_ad(page_align(m_heap_top) == m_page);
+ ut_ad(m_heap);
+
+ const ulint rec_size= rec_offs_size(offsets);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+ ut_ad(page_align(m_heap_top + rec_size) == m_page);
+ ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
+
+#ifdef UNIV_DEBUG
+ /* Check whether records are in order. */
+ if (page_offset(m_cur_rec) !=
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+ {
+ const rec_t *old_rec = m_cur_rec;
+ rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf
+ ? m_index->n_core_fields : 0,
+ ULINT_UNDEFINED, &m_heap);
+ ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0);
+ }
+
+ m_total_data+= rec_size;
+#endif /* UNIV_DEBUG */
+
+ /* The record header (extra_size bytes) is stored at m_heap_top;
+ the record origin follows it. */
+ rec_t* const insert_rec= m_heap_top + extra_size;
+
+ /* Insert the record in the linked list. */
+ if (fmt != REDUNDANT)
+ {
+ /* In this branch the next-record pointer is stored as an offset
+ relative to the record that holds it. */
+ const rec_t *next_rec= m_page +
+ page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
+ if (fmt != COMPRESSED)
+ m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - m_cur_rec));
+ else
+ {
+ mach_write_to_2(m_cur_rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - m_cur_rec));
+ memcpy(m_heap_top, rec - extra_size, rec_size);
+ }
+
+ /* For COMPRESSED the record is already on the page, so its header is
+ adjusted in place; otherwise the source buffer is adjusted before it
+ is copied below. */
+ rec_t * const this_rec= fmt != COMPRESSED
+ ? const_cast<rec_t*>(rec) : insert_rec;
+ rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+ REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(this_rec - REC_NEXT,
+ static_cast<uint16_t>(next_rec - insert_rec));
+ }
+ else
+ {
+ /* In this branch the next-record pointer is a page offset: the new
+ record inherits the pointer of m_cur_rec, and m_cur_rec is made to
+ point to the new record. */
+ memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+ m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
+ rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+ REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(const_cast<rec_t*>(rec),
+ PAGE_HEAP_NO_USER_LOW + m_rec_no,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ }
+
+ if (fmt == COMPRESSED)
+ /* We already wrote the record. Log is written in PageBulk::compress(). */;
+ else if (page_offset(m_cur_rec) ==
+ (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+ /* First user record on the page: there is no preceding user record
+ to copy from, so the whole record is written. */
+ m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+ else
+ {
+ /* Try to copy common prefix from the preceding record. */
+ const byte *r= rec - extra_size;
+ const byte * const insert_rec_end= m_heap_top + rec_size;
+ byte *b= m_heap_top;
+
+ /* Skip any unchanged prefix of the record. */
+ for (; * b == *r; b++, r++);
+
+ ut_ad(b < insert_rec_end);
+
+ /* c is the byte of the preceding record (m_cur_rec) that lies at the
+ same distance from its origin as r lies from the origin of rec. */
+ const byte *c= m_cur_rec - (rec - r);
+ const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+ m_heap_top);
+
+ /* Try to copy any bytes of the preceding record. */
+ if (UNIV_LIKELY(c >= m_page && c < c_end))
+ {
+ const byte *cm= c;
+ byte *bm= b;
+ const byte *rm= r;
+ for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+ ut_ad(bm <= insert_rec_end);
+ size_t len= static_cast<size_t>(rm - r);
+ ut_ad(!memcmp(r, c, len));
+ /* Matches of at most 2 bytes are not copied from the preceding record. */
+ if (len > 2)
+ {
+ memcpy(b, c, len);
+ m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+ c= cm;
+ b= bm;
+ r= rm;
+ }
+ }
+
+ if (c < m_cur_rec)
+ {
+ if (!rec_offs_data_size(offsets))
+ {
+no_data:
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ goto rec_done;
+ }
+ /* Some header bytes differ. Compare the data separately. */
+ const byte *cd= m_cur_rec;
+ byte *bd= insert_rec;
+ const byte *rd= rec;
+ /* Skip any unchanged prefix of the record. */
+ for (;; cd++, bd++, rd++)
+ if (bd == insert_rec_end)
+ goto no_data;
+ else if (*bd != *rd)
+ break;
+
+ /* Try to copy any data bytes of the preceding record. */
+ if (c_end - cd > 2)
+ {
+ const byte *cdm= cd;
+ const byte *rdm= rd;
+ for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++)
+ ut_ad(rdm - rd + bd <= insert_rec_end);
+ size_t len= static_cast<size_t>(rdm - rd);
+ ut_ad(!memcmp(rd, cd, len));
+ if (len > 2)
+ {
+ /* Write the remaining header bytes first, then copy the matching
+ data bytes from the preceding record. */
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, m_cur_rec - c);
+ memcpy(bd, cd, len);
+ m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+ c= cdm;
+ b= rdm - rd + bd;
+ r= rdm;
+ }
+ }
+ }
+
+ /* Write whatever is left of the record. */
+ if (size_t len= static_cast<size_t>(insert_rec_end - b))
+ m_mtr.memcpy<mtr_t::FORCED>(*m_block, b, r, len);
+ }
+
+rec_done:
+ ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+ rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
+
+ /* Update the member variables. */
+ ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
+ page_dir_calc_reserved_space(m_rec_no);
+
+ ut_ad(m_free_space >= rec_size + slot_size);
+ ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
+
+ m_free_space-= rec_size + slot_size;
+ m_heap_top+= rec_size;
+ m_rec_no++;
+ m_cur_rec= insert_rec;
+}
+
+/** Insert a record in the page, dispatching on the page format.
+For the uncompressed formats, the record header bytes in the caller's
+buffer are saved before the insert and restored afterwards.
+@param[in] rec record
+@param[in] offsets record offsets */
+inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets)
+{
+  static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+  rec_t *const writable_rec= const_cast<rec_t*>(rec);
+
+  if (UNIV_LIKELY_NULL(m_page_zip))
+  {
+    insertPage<COMPRESSED>(writable_rec, offsets);
+    return;
+  }
+
+  /* Large enough for either header size, as asserted above. */
+  byte saved_hdr[REC_N_OLD_EXTRA_BYTES];
+  const size_t hdr_len= m_is_comp
+    ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES;
+
+  memcpy(saved_hdr, rec - hdr_len, hdr_len);
+
+  if (m_is_comp)
+    insertPage<DYNAMIC>(writable_rec, offsets);
+  else
+    insertPage<REDUNDANT>(writable_rec, offsets);
+
+  memcpy(writable_rec - hdr_len, saved_hdr, hdr_len);
+}
+
+/** Store the n_owned field of a record in the uncompressed page frame
+of a ROW_FORMAT=COMPRESSED page. No redo log is written.
+@param[in,out] rec record whose header is updated
+@param[in] n_owned number of records owned */
+static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned)
+{
+  rec_set_bit_field_1(rec,
+                      n_owned,
+                      REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK,
+                      REC_N_OWNED_SHIFT);
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members.
+The record list is walked from the infimum to the supremum; a directory
+slot is assigned every (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 records, and
+the last slot always points to the supremum.
+@tparam fmt page format */
+template<PageBulk::format fmt>
+inline void PageBulk::finishPage()
+{
+ ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+ ut_ad((fmt != REDUNDANT) == m_is_comp);
+
+ /* count is the number of records seen since the last assigned slot. */
+ ulint count= 0;
+ /* slot starts at the first directory slot (slot0) and moves towards
+ lower addresses as further slots are assigned. */
+ byte *slot= my_assume_aligned<2>(m_page + srv_page_size -
+ (PAGE_DIR + PAGE_DIR_SLOT_SIZE));
+ const page_dir_slot_t *const slot0 = slot;
+ compile_time_assert(PAGE_DIR_SLOT_SIZE == 2);
+ if (fmt != REDUNDANT)
+ {
+ /* The next-record pointer read here is relative to the record
+ holding it; convert it to a page offset. */
+ uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page);
+ ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM);
+ offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM);
+ /* Set owner & dir. */
+ while (offset != PAGE_NEW_SUPREMUM)
+ {
+ ut_ad(offset >= PAGE_NEW_SUPREMUM);
+ ut_ad(offset < page_offset(slot));
+ count++;
+
+ if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+ {
+ slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, offset);
+
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, m_page + offset, count, true,
+ &m_mtr);
+ else
+ rec_set_n_owned_zip(m_page + offset, count);
+
+ count= 0;
+ }
+
+ /* Follow the relative next-record pointer, wrapping within the page. */
+ uint16_t next= static_cast<uint16_t>
+ ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) &
+ (srv_page_size - 1));
+ ut_ad(next);
+ offset= next;
+ }
+
+ if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+ PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+ count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ /* The previous owner no longer owns anything; the slot is reused
+ for the supremum below. */
+ rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, rec, 0, true, &m_mtr);
+ else
+ rec_set_n_owned_zip(rec, 0);
+ }
+ else
+ slot-= PAGE_DIR_SLOT_SIZE;
+
+ /* The supremum owns the remaining records plus itself. */
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ if (fmt != COMPRESSED)
+ page_rec_set_n_owned<false>(m_block, m_page + PAGE_NEW_SUPREMUM,
+ count + 1, true, &m_mtr);
+ else
+ rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
+ }
+ else
+ {
+ /* In this branch the next-record pointer is used directly as a
+ page offset. */
+ rec_t *insert_rec= m_page +
+ mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page);
+
+ /* Set owner & dir. */
+ while (insert_rec != m_page + PAGE_OLD_SUPREMUM)
+ {
+ count++;
+
+ if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+ {
+ slot-= PAGE_DIR_SLOT_SIZE;
+ mach_write_to_2(slot, page_offset(insert_rec));
+ page_rec_set_n_owned<false>(m_block, insert_rec, count, false, &m_mtr);
+ count= 0;
+ }
+
+ insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT);
+ }
+
+ if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+ PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+ count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+ page_rec_set_n_owned<false>(m_block, rec, 0, false, &m_mtr);
+ }
+ else
+ slot-= PAGE_DIR_SLOT_SIZE;
+
+ mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
+ page_rec_set_n_owned<false>(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
+ false, &m_mtr);
+ }
+
+ /* No page header or directory update is written for a page without
+ user records. */
+ if (!m_rec_no);
+ else if (fmt != COMPRESSED)
+ {
+ /* Build PAGE_N_DIR_SLOTS, PAGE_HEAP_TOP and PAGE_N_HEAP in a local
+ buffer and log them with one write; bit 15 of PAGE_N_HEAP is set for
+ the non-REDUNDANT formats. */
+ static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+ alignas(8) byte page_header[PAGE_N_HEAP + 2];
+ mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+ 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+ mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+ mach_write_to_2(page_header + PAGE_N_HEAP,
+ (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+ uint16_t{fmt != REDUNDANT} << 15);
+ m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+ sizeof page_header);
+ m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+ /* Log the directory slots that were written directly above. */
+ m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
+ }
+ else
+ {
+ /* For ROW_FORMAT=COMPRESSED, redo log may be written in
+ PageBulk::compress(). */
+ mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
+ 1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+ mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
+ static_cast<ulint>(m_heap_top - m_page));
+ mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
+ (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15);
+ mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+ }
+}
+
+/** Check whether finishPage() still has to be run for the page.
+@return true if PAGE_DIRECTION_B is 0 (as set by init()), or if the heap
+number of m_cur_rec is not the last one recorded in PAGE_N_HEAP;
+false if m_cur_rec is the infimum and PAGE_HEAP_TOP is that of an empty page */
+inline bool PageBulk::needs_finish() const
+{
+  ut_ad(page_align(m_cur_rec) == m_block->page.frame);
+  ut_ad(m_page == m_block->page.frame);
+
+  if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B])
+    return true;
+
+  const ulint n_heap_field= page_header_get_field(m_page, PAGE_N_HEAP);
+  ut_ad((n_heap_field & 0x7fff) >= PAGE_HEAP_NO_USER_LOW);
+
+  /* Bit 15 of PAGE_N_HEAP selects the record header layout. */
+  if (n_heap_field & 0x8000)
+  {
+    const ulint cur_heap_no= rec_get_heap_no_new(m_cur_rec);
+    if (cur_heap_no == PAGE_HEAP_NO_INFIMUM &&
+        page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END)
+      return false;
+    return cur_heap_no != (n_heap_field & 0x7fff) - 1;
+  }
+
+  const ulint cur_heap_no= rec_get_heap_no_old(m_cur_rec);
+  if (cur_heap_no == PAGE_HEAP_NO_INFIMUM &&
+      page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END)
+    return false;
+  return cur_heap_no != n_heap_field - 1;
+}
+
+/** Mark end of insertion to the page.
+If needs_finish() holds, run finishPage() for the format of the page to
+set the page directory and the page header members. In every case,
+restore PAGE_DIRECTION_B to PAGE_NO_DIRECTION. */
+inline void PageBulk::finish()
+{
+  ut_ad(!m_index->is_spatial());
+
+  if (needs_finish())
+  {
+    if (UNIV_LIKELY_NULL(m_page_zip))
+      finishPage<COMPRESSED>();
+    else if (m_is_comp)
+      finishPage<DYNAMIC>();
+    else
+      finishPage<REDUNDANT>();
+  }
+
+  /* In MariaDB 10.2, 10.3, 10.4, we would initialize
+  PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT
+  in the same way as we would during normal INSERT operations.
+  Starting with MariaDB Server 10.5, bulk insert will not
+  touch those fields. */
+  ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]);
+
+  /* Undo the temporary change made by PageBulk::init(), which cleared
+  this byte so that PageBulk::needs_finish() would hold on an empty page. */
+  m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION;
+
+  ut_ad(!page_header_get_field(m_page, PAGE_FREE));
+  ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE));
+  ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT));
+  ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION));
+  ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <=
+        page_get_free_space_of_empty(m_is_comp));
+  ut_ad(!needs_finish());
+  ut_ad(page_validate(m_page, m_index));
+}
+
+/** Finish the page and commit the mini-transaction.
+For a successfully filled leaf page of a non-clustered index, the
+change buffer bitmap is set for bulk load before the commit.
+@param[in] success Flag whether all inserts succeed. */
+void PageBulk::commit(bool success)
+{
+  finish();
+
+  const bool update_ibuf_bitmap=
+    success && !m_index->is_clust() && page_is_leaf(m_page);
+
+  if (update_ibuf_bitmap)
+  {
+    ibuf_set_bitmap_for_bulk_load(m_block, &m_mtr,
+                                  innobase_fill_factor == 100);
+  }
+
+  m_mtr.commit();
+}
+
+/** Compress the page of a ROW_FORMAT=COMPRESSED table.
+@return whether page_zip_compress() succeeded */
+bool
+PageBulk::compress()
+{
+  ut_ad(m_page_zip != NULL);
+
+  const bool compressed_ok=
+    page_zip_compress(m_block, m_index, page_zip_level, &m_mtr);
+
+  return compressed_ok;
+}
+
+/** Build a node pointer for this page from its first user record.
+The tuple is allocated from m_heap.
+@return node pointer */
+dtuple_t*
+PageBulk::getNodePtr()
+{
+  /* The first user record follows the infimum. */
+  rec_t* const first_user_rec =
+    page_rec_get_next(page_get_infimum_rec(m_page));
+  ut_a(page_rec_is_user_rec(first_user_rec));
+
+  return dict_index_build_node_ptr(m_index, first_user_rec, m_page_no,
+                                   m_heap, m_level);
+}
+
+/** Get the split record in the left page. We split a page in half when
+compression fails, and the split record will be copied to the right page.
+Records are accumulated from the start of the page until their size plus
+the directory space reaches half of the used space.
+@return split rec */
+rec_t*
+PageBulk::getSplitRec()
+{
+  ut_ad(m_page_zip != NULL);
+  ut_ad(m_rec_no >= 2);
+  ut_ad(!m_index->is_instant());
+
+  ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space);
+  const ulint used_size =
+    page_get_free_space_of_empty(m_is_comp) - m_free_space;
+
+  const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0;
+
+  rec_offs* offsets = NULL;
+  rec_t* split = page_get_infimum_rec(m_page);
+  ulint accumulated_size = 0;
+  ulint n_accumulated = 0;
+
+  for (;;) {
+    split = page_rec_get_next(split);
+    ut_ad(page_rec_is_user_rec(split));
+
+    offsets = rec_get_offsets(split, m_index, offsets, n_core,
+                              ULINT_UNDEFINED, &m_heap);
+    accumulated_size += rec_offs_size(offsets);
+    n_accumulated++;
+
+    if (accumulated_size + page_dir_calc_reserved_space(n_accumulated)
+        >= used_size / 2) {
+      break;
+    }
+  }
+
+  /* Keep at least one record on left page */
+  if (page_rec_is_first(split, m_page)) {
+    split = page_rec_get_next(split);
+    ut_ad(page_rec_is_user_rec(split));
+  }
+
+  return split;
+}
+
+/** Copy all records from the split record (inclusive) up to the
+supremum into this page, which must not contain any records yet.
+@param[in] split_rec split rec */
+void
+PageBulk::copyIn(
+  rec_t* split_rec)
+{
+  ut_ad(m_rec_no == 0);
+  ut_ad(page_rec_is_user_rec(split_rec));
+
+  const ulint n_core = page_rec_is_leaf(split_rec)
+    ? m_index->n_core_fields : 0;
+
+  rec_offs* offsets = NULL;
+  rec_t* cur = split_rec;
+
+  for (;;) {
+    offsets = rec_get_offsets(cur, m_index, offsets, n_core,
+                              ULINT_UNDEFINED, &m_heap);
+    insert(cur, offsets);
+
+    cur = page_rec_get_next(cur);
+    if (page_rec_is_supremum(cur)) {
+      break;
+    }
+  }
+
+  ut_ad(m_rec_no > 0);
+}
+
+/** Remove all records from the split record (inclusive) to the end of
+the page, and update the bookkeeping members accordingly.
+@param[in] split_rec split rec */
+void
+PageBulk::copyOut(
+  rec_t* split_rec)
+{
+  /* Example: with infimum->r1->r2->r3->r4->r5->supremum and r3 as the
+  split record, the page becomes infimum->r1->r2->supremum.
+  The page directory slots are not adjusted here. */
+
+  /* Find the predecessor of split_rec, counting the records kept. */
+  rec_t* prev = page_get_infimum_rec(m_page);
+  ulint n_kept = 0;
+
+  for (rec_t* cur = page_rec_get_next(prev); cur != split_rec;
+       cur = page_rec_get_next(cur)) {
+    prev = cur;
+    n_kept++;
+  }
+
+  ut_ad(n_kept > 0);
+
+  /* Find the last user record of the page. */
+  const rec_t* last_rec = split_rec;
+
+  for (const rec_t* cur = page_rec_get_next_const(last_rec);
+       !page_rec_is_supremum(cur);
+       cur = page_rec_get_next_const(cur)) {
+    last_rec = cur;
+  }
+
+  const ulint n_core = page_rec_is_leaf(split_rec)
+    ? m_index->n_core_fields : 0;
+
+  rec_offs* offsets = rec_get_offsets(prev, m_index, nullptr, n_core,
+                                      ULINT_UNDEFINED, &m_heap);
+
+  /* Make the predecessor the last record by pointing it to the supremum. */
+  mach_write_to_2(prev - REC_NEXT, m_is_comp
+                  ? static_cast<uint16_t>
+                  (PAGE_NEW_SUPREMUM - page_offset(prev))
+                  : PAGE_OLD_SUPREMUM);
+
+  /* Set related members */
+  m_cur_rec = prev;
+  m_heap_top = rec_get_end(prev, offsets);
+
+  offsets = rec_get_offsets(last_rec, m_index, offsets, n_core,
+                            ULINT_UNDEFINED, &m_heap);
+
+  /* Give back the bytes between the new heap top and the end of the
+  last removed record, plus the directory space no longer needed. */
+  m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top)
+    + page_dir_calc_reserved_space(m_rec_no)
+    - page_dir_calc_reserved_space(n_kept);
+  ut_ad(lint(m_free_space) > 0);
+  m_rec_no = n_kept;
+
+#ifdef UNIV_DEBUG
+  m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top);
+#endif /* UNIV_DEBUG */
+}
+
+/** Set the FIL_PAGE_NEXT field of the page.
+@param[in] next_page_no next page no */
+inline void PageBulk::setNext(ulint next_page_no)
+{
+  if (UNIV_LIKELY_NULL(m_page_zip))
+  {
+    /* For ROW_FORMAT=COMPRESSED, redo log may be written
+    in PageBulk::compress(). */
+    mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no);
+    return;
+  }
+
+  m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no);
+}
+
+/** Set the FIL_PAGE_PREV field of the page.
+@param[in] prev_page_no previous page no */
+inline void PageBulk::setPrev(ulint prev_page_no)
+{
+  if (UNIV_LIKELY_NULL(m_page_zip))
+  {
+    /* For ROW_FORMAT=COMPRESSED, redo log may be written
+    in PageBulk::compress(). */
+    mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no);
+    return;
+  }
+
+  m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no);
+}
+
+/** Check if required space is available in the page for the rec to be
+inserted. We check fill factor & padding here.
+@param[in] rec_size required length
+@return true if space is available */
+bool
+PageBulk::isSpaceAvailable(
+  ulint rec_size)
+{
+  if (m_rec_no >= 8190) {
+    ut_ad(srv_page_size == 65536);
+    return false;
+  }
+
+  /* Directory space that one more record would add. */
+  const ulint slot_size = page_dir_calc_reserved_space(m_rec_no + 1)
+    - page_dir_calc_reserved_space(m_rec_no);
+  const ulint required_space = rec_size + slot_size;
+
+  if (required_space > m_free_space) {
+    ut_ad(m_rec_no > 0);
+    return false;
+  }
+
+  /* Fillfactor & Padding apply to both leaf and non-leaf pages.
+  Note: we keep at least 2 records in a page to avoid B-tree level
+  growing too high. */
+  if (m_rec_no < 2) {
+    return true;
+  }
+
+  /* Uncompressed pages honour the fill factor reserve; compressed
+  pages honour the compression padding. */
+  const ulint space_after_insert = m_free_space - required_space;
+  const ulint space_to_keep = m_page_zip == NULL
+    ? m_reserved_space : m_padding_space;
+
+  return space_after_insert >= space_to_keep;
+}
+
+/** Check whether the record needs to be stored externally.
+@param[in] tuple tuple whose field count is passed to the check
+@param[in] rec_size converted record size
+@return false if the entire record can be stored locally on the page */
+bool
+PageBulk::needExt(
+  const dtuple_t* tuple,
+  ulint rec_size)
+{
+  const ulint n_fields = dtuple_get_n_fields(tuple);
+
+  return page_zip_rec_needs_ext(rec_size, m_is_comp, n_fields,
+                                m_block->zip_size());
+}
+
+/** Store the externally stored fields of the record at m_cur_rec.
+Since the record is not logged yet, we don't log the update to the record:
+the blob data is logged first, then the record is logged in bulk mode.
+@param[in] big_rec external record
+@param[in] offsets record offsets
+@return error code */
+dberr_t
+PageBulk::storeExt(
+  const big_rec_t* big_rec,
+  rec_offs* offsets)
+{
+  finish();
+
+  /* Note: not all fields are initialized in btr_pcur. Only the members
+  assigned below are set up before the call. */
+  btr_pcur_t btr_pcur;
+  btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED;
+  btr_pcur.latch_mode = BTR_MODIFY_LEAF;
+
+  auto& page_cur = btr_pcur.btr_cur.page_cur;
+  page_cur.index = m_index;
+  page_cur.rec = m_cur_rec;
+  page_cur.offsets = offsets;
+  page_cur.block = m_block;
+
+  return btr_store_big_rec_extern_fields(
+    &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK);
+}
+
+/** Release the block by committing the mini-transaction.
+The block is buffer-fixed first and its modify clock is remembered in
+m_modify_clock.
+Note: log_free_check requires holding no lock/latch in current thread. */
+void
+PageBulk::release()
+{
+  finish();
+
+  /* Keep the block fixed, because we will re-pin it soon. */
+  m_block->page.fix();
+
+  /* No other threads can modify this block. */
+  m_modify_clock = buf_block_get_modify_clock(m_block);
+
+  m_mtr.commit();
+}
+
+/** Start the mini-transaction, X-latch the block and register the
+latch in the mini-transaction. The block is expected to be buffer-fixed
+already. */
+void PageBulk::latch()
+{
+  m_mtr.start();
+  m_index->set_modified(m_mtr);
+
+#ifdef BTR_CUR_HASH_ADAPT
+  ut_ad(!m_block->index);
+#endif
+
+  m_block->page.lock.x_lock();
+  ut_ad(m_block->page.buf_fix_count());
+  m_mtr.memo_push(m_block, MTR_MEMO_PAGE_X_FIX);
+
+  /* The insert position must still lie within the used part of the page. */
+  ut_ad(m_cur_rec > m_page);
+  ut_ad(m_cur_rec < m_heap_top);
+}
+
+/** Split a page: move its upper half to a newly allocated page and
+commit both pages.
+@param[in] page_bulk page to split
+@param[in] next_page_bulk next page
+@return error code; DB_TOO_BIG_RECORD if the page holds at most one record */
+dberr_t
+BtrBulk::pageSplit(
+  PageBulk* page_bulk,
+  PageBulk* next_page_bulk)
+{
+  ut_ad(page_bulk->getPageZip() != NULL);
+
+  if (page_bulk->getRecNo() <= 1) {
+    return DB_TOO_BIG_RECORD;
+  }
+
+  /* Initialize a new page */
+  PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL,
+                         page_bulk->getLevel());
+  dberr_t err = new_page_bulk.init();
+  if (err != DB_SUCCESS) {
+    return err;
+  }
+
+  /* Copy the upper half to the new page. */
+  rec_t* split_rec = page_bulk->getSplitRec();
+  new_page_bulk.copyIn(split_rec);
+  page_bulk->copyOut(split_rec);
+
+  /* Commit the pages after split: first the left page, then, if that
+  succeeded, the new right page. */
+  err = pageCommit(page_bulk, &new_page_bulk, true);
+  if (err == DB_SUCCESS) {
+    err = pageCommit(&new_page_bulk, next_page_bulk, true);
+  }
+
+  if (err != DB_SUCCESS) {
+    pageAbort(&new_page_bulk);
+  }
+
+  return err;
+}
+
+/** Commit (finish) a page. We set next/prev page no, compress a page of
+compressed table and split the page if compression fails, insert a node
+pointer to father page if needed, and commit mini-transaction.
+@param[in] page_bulk page to commit
+@param[in] next_page_bulk next page
+@param[in] insert_father false when page_bulk is a root page and
+                          true when it's a non-root page
+@return error code */
+dberr_t
+BtrBulk::pageCommit(
+  PageBulk* page_bulk,
+  PageBulk* next_page_bulk,
+  bool insert_father)
+{
+  page_bulk->finish();
+
+  /* Set page links */
+  if (next_page_bulk == NULL) {
+    ut_ad(!page_has_next(page_bulk->getPage()));
+    /* If a page is released and latched again, we need to
+    mark it modified in mini-transaction. */
+    page_bulk->set_modified();
+  } else {
+    ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel());
+
+    page_bulk->setNext(next_page_bulk->getPageNo());
+    next_page_bulk->setPrev(page_bulk->getPageNo());
+  }
+
+  ut_ad(!m_index->lock.have_any());
+
+  /* Compress page if it's a compressed table; split on failure. */
+  if (page_bulk->getPageZip() != NULL) {
+    if (!page_bulk->compress()) {
+      return pageSplit(page_bulk, next_page_bulk);
+    }
+  }
+
+  /* Insert node pointer to father page. */
+  if (insert_father) {
+    dtuple_t* node_ptr = page_bulk->getNodePtr();
+    const dberr_t err = insert(node_ptr, page_bulk->getLevel() + 1);
+
+    if (err != DB_SUCCESS) {
+      return err;
+    }
+  }
+
+  /* Commit mtr. */
+  page_bulk->commit(true);
+
+  return DB_SUCCESS;
+}
+
+/** Log free check: if the log system requests a flush or checkpoint,
+release all page latches, check the log margins, and latch again. */
+inline void BtrBulk::logFreeCheck()
+{
+  if (!log_sys.check_flush_or_checkpoint()) {
+    return;
+  }
+
+  release();
+
+  log_check_margins();
+
+  latch();
+}
+
+/** Release all latches: release the page bulk of every level,
+from the leaf level up to the root level. */
+void
+BtrBulk::release()
+{
+  ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+  for (ulint lvl = 0; lvl <= m_root_level; lvl++) {
+    m_page_bulks.at(lvl)->release();
+  }
+}
+
+/** Re-latch all latches: latch the page bulk of every level,
+from the leaf level up to the root level. */
+void
+BtrBulk::latch()
+{
+  ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+  for (ulint lvl = 0; lvl <= m_root_level; lvl++) {
+    m_page_bulks.at(lvl)->latch();
+  }
+}
+
+/** Insert a tuple to page in a level
+A PageBulk is created for the level if none exists yet. If the current
+page of the level has no room, it is committed and replaced by a newly
+allocated sibling page before the tuple is converted and inserted.
+Fields moved to big_rec are converted back into the tuple before returning
+from func_exit.
+@param[in] tuple tuple to insert
+@param[in] level B-tree level
+@return error code */
+dberr_t
+BtrBulk::insert(
+ dtuple_t* tuple,
+ ulint level)
+{
+ bool is_left_most = false;
+ dberr_t err = DB_SUCCESS;
+
+ /* Check if we need to create a PageBulk for the level. */
+ if (level + 1 > m_page_bulks.size()) {
+ PageBulk* new_page_bulk
+ = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL,
+ level));
+ err = new_page_bulk->init();
+ if (err != DB_SUCCESS) {
+ UT_DELETE(new_page_bulk);
+ return(err);
+ }
+
+ /* The new level becomes the root level of the tree being built. */
+ m_page_bulks.push_back(new_page_bulk);
+ ut_ad(level + 1 == m_page_bulks.size());
+ m_root_level = level;
+
+ is_left_most = true;
+ }
+
+ ut_ad(m_page_bulks.size() > level);
+
+ PageBulk* page_bulk = m_page_bulks.at(level);
+
+ if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) {
+ /* The node pointer must be marked as the predefined minimum
+ record, as there is no lower alphabetical limit to records in
+ the leftmost node of a level: */
+ dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+ | REC_INFO_MIN_REC_FLAG);
+ }
+
+ ulint n_ext = 0;
+ ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext);
+ big_rec_t* big_rec = NULL;
+ rec_t* rec = NULL;
+ rec_offs* offsets = NULL;
+
+ if (page_bulk->needExt(tuple, rec_size)) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext);
+
+ if (big_rec == NULL) {
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ /* The tuple shrank; recompute the size of the converted record. */
+ rec_size = rec_get_converted_size(m_index, tuple, n_ext);
+ }
+
+ if (page_bulk->getPageZip() != NULL
+ && page_zip_is_too_big(m_index, tuple)) {
+ err = DB_TOO_BIG_RECORD;
+ goto func_exit;
+ }
+
+ if (!page_bulk->isSpaceAvailable(rec_size)) {
+ /* Create a sibling page_bulk. */
+ PageBulk* sibling_page_bulk;
+ sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id,
+ FIL_NULL, level));
+ err = sibling_page_bulk->init();
+ if (err != DB_SUCCESS) {
+ UT_DELETE(sibling_page_bulk);
+ goto func_exit;
+ }
+
+ /* Commit page bulk. */
+ err = pageCommit(page_bulk, sibling_page_bulk, true);
+ if (err != DB_SUCCESS) {
+ pageAbort(sibling_page_bulk);
+ UT_DELETE(sibling_page_bulk);
+ goto func_exit;
+ }
+
+ /* Set new page bulk to page_bulks. */
+ ut_ad(sibling_page_bulk->getLevel() <= m_root_level);
+ m_page_bulks.at(level) = sibling_page_bulk;
+
+ /* The committed page is no longer referenced from m_page_bulks. */
+ UT_DELETE(page_bulk);
+ page_bulk = sibling_page_bulk;
+
+ /* Important: log_free_check whether we need a checkpoint. */
+ if (page_is_leaf(sibling_page_bulk->getPage())) {
+ if (trx_is_interrupted(m_trx)) {
+ err = DB_INTERRUPTED;
+ goto func_exit;
+ }
+
+ srv_inc_activity_count();
+ logFreeCheck();
+ }
+ }
+
+ /* Convert tuple to rec. */
+ /* The record buffer and the offsets are allocated from the heap of the
+ target page bulk. */
+ rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc(
+ page_bulk->m_heap, rec_size)), m_index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, m_index, offsets, level
+ ? 0 : m_index->n_core_fields,
+ ULINT_UNDEFINED, &page_bulk->m_heap);
+
+ page_bulk->insert(rec, offsets);
+
+ if (big_rec != NULL) {
+ ut_ad(dict_index_is_clust(m_index));
+ ut_ad(page_bulk->getLevel() == 0);
+ ut_ad(page_bulk == m_page_bulks.at(0));
+
+ /* Release all pages above the leaf level */
+ /* NOTE(review): the loop variable below shadows the parameter
+ "level"; the parameter is not used after this point. */
+ for (ulint level = 1; level <= m_root_level; level++) {
+ m_page_bulks.at(level)->release();
+ }
+
+ err = page_bulk->storeExt(big_rec, offsets);
+
+ /* Latch */
+ /* NOTE(review): both "level" and "page_bulk" are shadowed inside
+ this loop. */
+ for (ulint level = 1; level <= m_root_level; level++) {
+ PageBulk* page_bulk = m_page_bulks.at(level);
+ page_bulk->latch();
+ }
+ }
+
+func_exit:
+ /* Restore the fields that were moved out of the tuple, on success
+ and on failure alike. */
+ if (big_rec != NULL) {
+ dtuple_convert_back_big_rec(m_index, tuple, big_rec);
+ }
+
+ return(err);
+}
+
+/** Btree bulk load finish. We commit the last page in each level
+and copy the last page in top level to the root page of the index
+if no error occurs. Every page bulk is deleted, whether or not an
+error occurred.
+@param[in] err whether bulk load was successful until now
+@return error code */
+dberr_t
+BtrBulk::finish(dberr_t err)
+{
+  ut_ad(!m_index->table->is_temporary());
+
+  if (m_page_bulks.size() == 0) {
+    /* The table is empty. The root page of the index tree
+    is already in a consistent state. No need to flush. */
+    return err;
+  }
+
+  ut_ad(m_root_level + 1 == m_page_bulks.size());
+
+  /* Page number of the topmost page bulk, once the loop has run. */
+  uint32_t last_page_no = FIL_NULL;
+
+  /* Finish all page bulks: commit while everything has succeeded,
+  abort from the first failure onwards. */
+  for (ulint lvl = 0; lvl <= m_root_level; lvl++) {
+    PageBulk* bulk = m_page_bulks.at(lvl);
+
+    last_page_no = bulk->getPageNo();
+
+    if (err == DB_SUCCESS) {
+      err = pageCommit(bulk, NULL, lvl != m_root_level);
+    }
+
+    if (err != DB_SUCCESS) {
+      pageAbort(bulk);
+    }
+
+    UT_DELETE(bulk);
+  }
+
+  if (err == DB_SUCCESS) {
+    mtr_t mtr;
+    PageBulk root_page_bulk(m_index, m_trx->id,
+                            m_index->page, m_root_level);
+
+    mtr.start();
+    m_index->set_modified(mtr);
+    mtr_x_lock_index(m_index, &mtr);
+
+    ut_ad(last_page_no != FIL_NULL);
+    buf_block_t* last_block = btr_block_get(*m_index, last_page_no,
+                                            RW_X_LATCH, false, &mtr);
+    if (!last_block) {
+      mtr.commit();
+      return DB_CORRUPTION;
+    }
+
+    rec_t* first_rec = page_rec_get_next(
+      page_get_infimum_rec(last_block->page.frame));
+    /* Because this index tree is being created by this thread,
+    we assume that it cannot be corrupted. */
+    ut_ad(first_rec);
+    ut_ad(page_rec_is_user_rec(first_rec));
+
+    /* Copy last page to root page. */
+    err = root_page_bulk.init();
+    if (err != DB_SUCCESS) {
+      mtr.commit();
+      return err;
+    }
+    root_page_bulk.copyIn(first_rec);
+    root_page_bulk.finish();
+
+    /* Remove last page. */
+    err = btr_page_free(m_index, last_block, &mtr);
+    mtr.commit();
+
+    /* An error from committing the root page overrides the result
+    of freeing the last page. */
+    if (dberr_t e = pageCommit(&root_page_bulk, NULL, false)) {
+      err = e;
+    }
+    ut_ad(err == DB_SUCCESS);
+  }
+
+  ut_ad(err != DB_SUCCESS
+        || btr_validate_index(m_index, NULL) == DB_SUCCESS);
+  return err;
+}
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
new file mode 100644
index 00000000..e736f338
--- /dev/null
+++ b/storage/innobase/btr/btr0cur.cc
@@ -0,0 +1,7017 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0cur.cc
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+of every modify or insert of a clustered index record.
+
+ NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve 2 x the height of the index tree
+many pages in the tablespace before we start the operation, because
+if leaf splitting has been started, it is difficult to undo, except
+by crashing the database and doing a roll-forward.
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0cur.h"
+#include "row0upd.h"
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "row0log.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "zlib.h"
+#include "srv0start.h"
+#include "mysql_com.h"
+#include "dict0stats.h"
+#include "row0ins.h"
+#ifdef WITH_WSREP
+#include "mysql/service_wsrep.h"
+#endif /* WITH_WSREP */
+#include "log.h"
+
+/** Buffered B-tree operation types, introduced as part of delete buffering.
+btr_cur_t::search_leaf() keeps a local variable of this type; the default
+case of its switch on the buffering flags of latch_mode selects BTR_NO_OP. */
+enum btr_op_t {
+ BTR_NO_OP = 0, /*!< Not buffered */
+ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
+ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
+ BTR_DELETE_OP, /*!< Purge a delete-marked record */
+ BTR_DELMARK_OP /*!< Mark a record for deletion */
+};
+
+/** Modification types for the B-tree operation.
+ Note that the order must be DELETE, BOTH, INSERT !!
+ btr_cur_will_modify_tree() depends on this order: it runs its delete-related
+ checks when lock_intention <= BTR_INTENTION_BOTH and its insert-related
+ checks when lock_intention >= BTR_INTENTION_BOTH, and it verifies the order
+ with compile_time_assert().
+ */
+enum btr_intention_t {
+ BTR_INTENTION_DELETE, /*!< chosen by btr_cur_get_and_clear_intention()
+ when only BTR_LATCH_FOR_DELETE is set */
+ BTR_INTENTION_BOTH, /*!< chosen when both flags, or neither of them,
+ are set ("both or unknown") */
+ BTR_INTENTION_INSERT /*!< chosen when only BTR_LATCH_FOR_INSERT is set */
+};
+
+/** After the index->lock scalability improvement, the only clear
+performance regression that was observed was caused by the history list
+growing to a huge length. That is because the exclusive use of index->lock
+also worked as a way of reserving free blocks and read I/O bandwidth with
+priority. To keep the history list from growing beyond the level of the
+previous implementation, pessimistic tree operations by purge are
+prioritized as before when the history list seems to be growing huge.
+
+Experimentally, the history list length starts to clearly affect
+throughput from about 100000. */
+#define BTR_CUR_FINE_HISTORY_LENGTH 100000
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf().
+This counter and the three variables below exist only when
+BTR_CUR_HASH_ADAPT is defined.
+NOTE(review): presumably this counts the searches that were not served by
+the adaptive hash index (compare btr_cur_n_sea) - confirm at the place
+where the counter is incremented. */
+ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_non_sea_old;
+/** Number of successful adaptive hash index lookups in
+btr_cur_t::search_leaf(). */
+ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+ulint btr_cur_n_sea_old;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records. Exists only in UNIV_DEBUG builds.
+NOTE(review): presumably consulted by LIMIT_OPTIMISTIC_INSERT_DEBUG(), which
+is defined outside this part of the file - confirm. */
+uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+/** In the optimistic insert, if the insert does not fit, but this much space
+can be released by page reorganize, then it is reorganized.
+btr_cur_will_modify_tree() and btr_cur_need_opposite_intention() also add
+this value to the node pointer size when they compare it against
+page_get_max_insert_size_after_reorganize(). */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32)
+
+/** The structure of a BLOB part header.
+The offsets below are relative to FIL_PAGE_DATA in the BLOB page frame:
+btr_cur_instant_init_low() reads both fields with mach_read_from_4() at
+FIL_PAGE_DATA + offset, and treats the bytes starting at
+FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE as the BLOB payload. */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
+ part header, in bytes */
+
+/* @} */
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields.
+This is only a forward declaration of a file-local (static) function; its
+definition is not part of this section of the file. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ buf_block_t* block, /*!< in/out: index page */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector.
+This is only a forward declaration of a file-local (static) function.
+Note the parameter order (rec, block, offsets), which differs from
+btr_rec_free_externally_stored_fields() (rec, offsets, block). */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in: record */
+ buf_block_t* block, /*!< in: index page of rec */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+/***********************************************************//**
+Frees the externally stored fields for a record.
+This is only a forward declaration of a file-local (static) function.
+Note the parameter order (rec, offsets, block), which differs from
+btr_rec_free_updated_extern_fields() (rec, block, offsets). */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in: record */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ buf_block_t* block, /*!< in: index page of rec */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+
+/*==================== B-TREE SEARCH =========================*/
+
+/** Load the instant ALTER TABLE metadata from the clustered index
+when loading a table definition.
+
+Outline of the steps performed:
+1. Fetch the root page with an SX latch and let btr_cur_instant_root_init()
+ initialize index->n_core_null_bytes from it.
+2. If the root page type is FIL_PAGE_INDEX, nothing more needs to be loaded.
+3. Otherwise open a leaf page cursor (asserted to be positioned before the
+ first record of a leaf page) and step to the next record, which is expected
+ to be the metadata record carrying REC_INFO_MIN_REC_FLAG.
+4. If the metadata record also carries REC_INFO_DELETED_FLAG, locate the
+ BLOB reference that follows DB_TRX_ID,DB_ROLL_PTR, validate the BLOB page
+ and pass its payload to index->table->deserialise_columns().
+5. For every field in [index->n_core_fields, index->n_fields), store the
+ default value found in the metadata record into col->def_val; copies are
+ allocated from index->table->heap.
+
+Every failure path sets index->table->corrupted; the paths for an unreadable
+root page or a failed leaf cursor also set index->table->file_unreadable.
+@param[in,out] index clustered index definition
+@param[in,out] mtr mini-transaction
+@return error code
+@retval DB_SUCCESS if no error occurred
+@retval DB_CORRUPTION if any corruption was noticed
+@retval other the error reported by btr_root_block_get() or by
+btr_cur_t::open_leaf() */
+static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
+ ut_ad(index->table->supports_instant());
+ ut_ad(index->table->is_readable());
+
+ dberr_t err;
+ const fil_space_t* space = index->table->space;
+ if (!space) { /* no tablespace object is attached to the table */
+corrupted:
+ err = DB_CORRUPTION;
+unreadable: /* entered with err already set by the caller of the goto */
+ ib::error() << "Table " << index->table->name
+ << " has an unreadable root page";
+ index->table->corrupted = true;
+ index->table->file_unreadable = true;
+ return err;
+ }
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err);
+ if (!root) {
+ goto unreadable;
+ }
+
+ if (btr_cur_instant_root_init(index, root->page.frame)) {
+ goto corrupted;
+ }
+
+ ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
+
+ if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) {
+ ut_ad(!index->is_instant());
+ return DB_SUCCESS;
+ }
+
+ btr_cur_t cur;
+ /* Relax the assertion in rec_init_offsets(). The flag is only
+ written inside ut_d(), that is, in debug builds. */
+ ut_ad(!index->in_instant_init);
+ ut_d(index->in_instant_init = true);
+ err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr);
+ ut_d(index->in_instant_init = false);
+ if (err != DB_SUCCESS) {
+ index->table->file_unreadable = true;
+ index->table->corrupted = true;
+ return err;
+ }
+
+ ut_ad(page_cur_is_before_first(&cur.page_cur));
+ ut_ad(page_is_leaf(cur.page_cur.block->page.frame));
+
+ const rec_t* rec = page_cur_move_to_next(&cur.page_cur);
+ const ulint comp = dict_table_is_comp(index->table);
+ const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0;
+
+ if (page_rec_is_supremum(rec)
+ || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
+ if (rec && !index->is_instant()) {
+ /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
+ assigned even if instant ADD COLUMN was not
+ committed. Changes to these page header fields are not
+ undo-logged, but changes to the hidden metadata record
+ are. If the server is killed and restarted, the page
+ header fields could remain set even though no metadata
+ record is present. */
+ return DB_SUCCESS;
+ }
+
+ ib::error() << "Table " << index->table->name
+ << " is missing instant ALTER metadata";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
+ }
+
+ if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
+ || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
+incompatible: /* shared exit for every "unrecognizable metadata" case */
+ ib::error() << "Table " << index->table->name
+ << " contains unrecognizable instant ALTER metadata";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
+ }
+
+ /* Read the metadata. We can get here on server restart
+ or when the table was evicted from the data dictionary cache
+ and is now being accessed again.
+
+ Here, READ COMMITTED and REPEATABLE READ should be equivalent.
+ Committing the ADD COLUMN operation would acquire
+ MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
+ concurrent operations on the table, including table eviction
+ from the cache. */
+
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ /* This metadata record includes a BLOB that identifies
+ any dropped or reordered columns. */
+ ulint trx_id_offset = index->trx_id_offset;
+ /* If !index->trx_id_offset, the PRIMARY KEY contains
+ variable-length columns. For the metadata record,
+ variable-length columns should be written with zero
+ length. However, before MDEV-21088 was fixed, for
+ variable-length encoded PRIMARY KEY column of type
+ CHAR, we wrote more than zero bytes. That is why we
+ must determine the actual length of each PRIMARY KEY
+ column. The DB_TRX_ID will start right after any
+ PRIMARY KEY columns. */
+ ut_ad(index->n_uniq);
+
+ /* We cannot invoke rec_get_offsets() before
+ index->table->deserialise_columns(). Therefore,
+ we must duplicate some logic here. */
+ if (trx_id_offset) { /* the fixed offset is already known */
+ } else if (index->table->not_redundant()) {
+ /* The PRIMARY KEY contains variable-length columns.
+ For the metadata record, variable-length columns are
+ always written with zero length. The DB_TRX_ID will
+ start right after any fixed-length columns. */
+
+ /* OK, before MDEV-21088 was fixed, for
+ variable-length encoded PRIMARY KEY column of
+ type CHAR, we wrote more than zero bytes. In
+ order to allow affected tables to be accessed,
+ it would be nice to determine the actual
+ length of each PRIMARY KEY column. However, to
+ be able to do that, we should determine the
+ size of the null-bit bitmap in the metadata
+ record. And we cannot know that before reading
+ the metadata BLOB, whose starting point we are
+ trying to find here. (Although the PRIMARY KEY
+ columns cannot be NULL, we would have to know
+ where the lengths of variable-length PRIMARY KEY
+ columns start.)
+
+ So, unfortunately we cannot help users who
+ were affected by MDEV-21088 on a ROW_FORMAT=COMPACT
+ or ROW_FORMAT=DYNAMIC table. */
+
+ for (uint i = index->n_uniq; i--; ) {
+ trx_id_offset += index->fields[i].fixed_len;
+ }
+ } else if (rec_get_1byte_offs_flag(rec)) {
+ trx_id_offset = rec_1_get_field_end_info(
+ rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
+ trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ trx_id_offset = rec_2_get_field_end_info(
+ rec, index->n_uniq - 1);
+ ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
+ trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK;
+ }
+
+ const byte* ptr = rec + trx_id_offset
+ + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); /* skip DB_TRX_ID,DB_ROLL_PTR; ptr is read below as a BLOB reference */
+
+ if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { /* the first 4 bytes of the length field must be 0 */
+ goto incompatible;
+ }
+
+ uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
+ if (!len
+ || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
+ != FIL_PAGE_DATA
+ || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ != space->id) {
+ goto incompatible;
+ }
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ 0, RW_S_LATCH, mtr);
+ if (!block) {
+ goto incompatible;
+ }
+
+ if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB
+ || mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ != FIL_NULL
+ || mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
+ != len) {
+ goto incompatible;
+ }
+
+ /* The unused part of the BLOB page should be zero-filled.
+ The scan starts right after the payload and stops
+ BTR_EXTERN_LEN bytes before the end of the page frame. */
+ for (const byte* b = block->page.frame
+ + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
+ * const end = block->page.frame + srv_page_size
+ - BTR_EXTERN_LEN;
+ b < end; ) {
+ if (*b++) {
+ goto incompatible;
+ }
+ }
+
+ if (index->table->deserialise_columns(
+ &block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) {
+ goto incompatible;
+ }
+
+ /* Proceed to initialize the default values of
+ any instantly added columns. */
+ }
+
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets = rec_get_offsets(rec, index, NULL,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ if (rec_offs_any_default(offsets)) {
+inconsistent: /* like incompatible, but releases the offsets heap first */
+ mem_heap_free(heap);
+ goto incompatible;
+ }
+
+ /* In fact, because we only ever append fields to the metadata
+ record, it is also OK to perform READ UNCOMMITTED and
+ then ignore any extra fields, provided that
+ trx_sys.is_registered(DB_TRX_ID). */
+ if (rec_offs_n_fields(offsets)
+ > ulint(index->n_fields) + !!index->table->instant
+ && !trx_sys.is_registered(current_trx(),
+ row_get_rec_trx_id(rec, index,
+ offsets))) {
+ goto inconsistent;
+ }
+
+ for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
+ dict_col_t* col = index->fields[i].col;
+ const unsigned o = i + !!index->table->instant; /* field position in the record: shifted by one when index->table->instant is set */
+ ulint len;
+ const byte* data = rec_get_nth_field(rec, offsets, o, &len);
+ ut_ad(!col->is_added());
+ ut_ad(!col->def_val.data);
+ col->def_val.len = len;
+ switch (len) {
+ case UNIV_SQL_NULL:
+ continue;
+ case 0:
+ col->def_val.data = field_ref_zero;
+ continue;
+ }
+ ut_ad(len != UNIV_SQL_DEFAULT);
+ if (!rec_offs_nth_extern(offsets, o)) {
+ col->def_val.data = mem_heap_dup(
+ index->table->heap, data, len);
+ } else if (len < BTR_EXTERN_FIELD_REF_SIZE
+ || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ col->def_val.len = UNIV_SQL_DEFAULT;
+ goto inconsistent;
+ } else {
+ col->def_val.data = btr_copy_externally_stored_field(
+ &col->def_val.len, data,
+ cur.page_cur.block->zip_size(),
+ len, index->table->heap);
+ }
+ }
+
+ mem_heap_free(heap);
+ return DB_SUCCESS;
+}
+
+/** Read the instant ALTER TABLE metadata of a table from its clustered
+index, inside a mini-transaction that is local to this call.
+@param[in,out] table table definition from the data dictionary
+@return error code
+@retval DB_SUCCESS if no error occurred
+@retval DB_CORRUPTION if the table has no first index */
+dberr_t
+btr_cur_instant_init(dict_table_t* table)
+{
+  mtr_t mtr;
+  dict_index_t *const clust_index= dict_table_get_first_index(table);
+  dberr_t result= DB_CORRUPTION;
+
+  mtr.start();
+  if (clust_index)
+  {
+    result= btr_cur_instant_init_low(clust_index, &mtr);
+  }
+  mtr.commit();
+
+  return result;
+}
+
+/** Set up index->n_core_null_bytes (and, for a FIL_PAGE_TYPE_INSTANT root,
+index->n_core_fields) on the first access to a clustered index root page.
+@param[in] index clustered index that is on its first access
+@param[in] page clustered index root page
+@return whether the page is corrupted */
+bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+{
+  ut_ad(!index->is_dummy);
+  ut_ad(index->is_primary());
+  ut_ad(!index->is_instant());
+  ut_ad(index->table->supports_instant());
+
+  /* A root page that has sibling links is treated as corrupted. */
+  if (page_has_siblings(page))
+    return true;
+
+  /* This is normally executed as part of btr_cur_instant_init()
+  when dict_load_table_one() is loading a table definition.
+  Other threads should not access or modify the n_core_null_bytes,
+  n_core_fields before dict_load_table_one() returns.
+
+  This can also be executed during IMPORT TABLESPACE, where the
+  table definition is exclusively locked. */
+  const auto page_type= fil_page_get_type(page);
+
+  if (page_type == FIL_PAGE_INDEX)
+  {
+    /* The field PAGE_INSTANT is guaranteed 0 on clustered
+    index root pages of ROW_FORMAT=COMPACT or
+    ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
+    if (page_is_comp(page) && page_get_instant(page))
+      return true;
+
+    index->n_core_null_bytes= static_cast<uint8_t>(
+      UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+    return false;
+  }
+
+  /* Any page type other than FIL_PAGE_TYPE_INSTANT is unexpected here. */
+  if (page_type != FIL_PAGE_TYPE_INSTANT)
+    return true;
+
+  const uint16_t n_core= page_get_instant(page);
+
+  /* The PRIMARY KEY (or hidden DB_ROW_ID) and DB_TRX_ID,DB_ROLL_PTR
+  columns must always be present as 'core' fields, and the count
+  may not exceed REC_MAX_N_FIELDS. */
+  if (n_core < index->n_uniq + DATA_ROLL_PTR || n_core > REC_MAX_N_FIELDS)
+    return true;
+
+  index->n_core_fields= n_core & dict_index_t::MAX_N_FIELDS;
+
+  const rec_t *const inf= page_get_infimum_rec(page);
+  const rec_t *const sup= page_get_supremum_rec(page);
+
+  const bool original_strings=
+    !memcmp(inf, "infimum", 8) && !memcmp(sup, "supremum", 8);
+
+  if (original_strings)
+  {
+    /* All fields, including those for instantly added columns,
+    must be present in the data dictionary. */
+    if (n_core > index->n_fields)
+      return true;
+
+    ut_ad(!index->is_dummy);
+    ut_d(index->is_dummy= true);
+    index->n_core_null_bytes= static_cast<uint8_t>(
+      UT_BITS_IN_BYTES(index->get_n_nullable(n_core)));
+    ut_d(index->is_dummy= false);
+    return false;
+  }
+
+  /* The infimum and supremum records must either contain the original
+  strings, or they must be filled with zero bytes, except for the
+  bytes that we have repurposed. */
+  if (memcmp(inf, field_ref_zero, 8) || memcmp(sup, field_ref_zero, 7))
+    return true;
+
+  index->n_core_null_bytes= sup[7];
+  return index->n_core_null_bytes > 128;
+}
+
+/**
+Extract the tree modification intention from the BTR_LATCH_FOR_INSERT and
+BTR_LATCH_FOR_DELETE flags of a latch mode, and clear those flags.
+@param latch_mode in/out: pointer to latch_mode
+@return intention for latching tree */
+static
+btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode)
+{
+  const auto requested=
+    *latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);
+
+  /* Strip the intention flags; the rest of the latch mode is kept. */
+  *latch_mode= btr_latch_mode(
+    *latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
+
+  if (requested == BTR_LATCH_FOR_INSERT)
+    return BTR_INTENTION_INSERT;
+
+  if (requested == BTR_LATCH_FOR_DELETE)
+    return BTR_INTENTION_DELETE;
+
+  /* both or unknown */
+  return BTR_INTENTION_BOTH;
+}
+
+/** Walk forward from one record, following page_rec_get_next_const(),
+until the other record is found, the chain ends, or the step budget
+is used up.
+@return whether the distance between two records is at most the
+specified value */
+static bool
+page_rec_distance_is_at_most(const rec_t *left, const rec_t *right, ulint val)
+{
+  for (;;)
+  {
+    if (left == right)
+      return true;
+
+    left= page_rec_get_next_const(left);
+
+    /* Stop when there is no next record or no step is left. */
+    if (!left || !val--)
+      return false;
+  }
+}
+
+/** Predicts whether modifying the given node pointer record might require
+a change of the tree structure. The answer is a conservative heuristic
+based on the record position, the amount of data in the page and the space
+that would be available after a page reorganize.
+@param[in] index index
+@param[in] page page
+@param[in] lock_intention lock intention for the tree operation
+@param[in] rec record (current node_ptr)
+@param[in] rec_size size of the record or max size of node_ptr
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] mtr mtr (only used in a debug assertion on index->lock)
+@return true if tree modification is needed */
+static
+bool
+btr_cur_will_modify_tree(
+ dict_index_t* index,
+ const page_t* page,
+ btr_intention_t lock_intention,
+ const rec_t* rec,
+ ulint rec_size,
+ ulint zip_size,
+ mtr_t* mtr)
+{
+ ut_ad(!page_is_leaf(page));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ /* A pessimistic delete of the first record causes a delete & insert
+ of the node_ptr at the upper level, and a subsequent page shrink is
+ possible, which deletes the node_ptr at the upper level.
+ So we have to pay attention to the 2nd record as well, not only to
+ the first record and the last record: if the "delete & insert" are
+ done for a different page, the 2nd record becomes the
+ first record, and a following compress might delete the record and
+ cause a modification of the upper level node_ptr. */
+
+ const ulint n_recs = page_get_n_recs(page);
+
+ if (lock_intention <= BTR_INTENTION_BOTH) {
+ compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
+ compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);
+
+ if (!page_has_siblings(page)) {
+ return true;
+ }
+
+ ulint margin = rec_size;
+
+ if (lock_intention == BTR_INTENTION_BOTH) {
+ ulint level = btr_page_get_level(page);
+
+ /* This value is the worst-case estimate of the number
+ of node_ptr records that may be deleted from this page.
+ It is used to predict whether the cursor position can
+ become the leftmost record in this page or not. */
+ ulint max_nodes_deleted = 0;
+
+ /* Tree-modifying operations coming from the levels
+ below this one could, in the (unrealistically rare)
+ worst case, delete up to 2 ^ (level - 1) records. */
+ if (level > 7) {
+ /* TODO: adjust this practical limit. */
+ max_nodes_deleted = 64;
+ } else if (level > 0) {
+ max_nodes_deleted = (ulint)1 << (level - 1);
+ }
+ /* Check whether a delete could cause a tree
+ modification. (BTR_INTENTION_BOTH
+ or BTR_INTENTION_DELETE) */
+ if (n_recs <= max_nodes_deleted * 2
+ || page_rec_is_first(rec, page)) {
+ /* The cursor record can be the leftmost record
+ in this page. */
+ return true;
+ }
+
+ if (page_has_prev(page)
+ && page_rec_distance_is_at_most(
+ page_get_infimum_rec(page), rec,
+ max_nodes_deleted)) {
+ return true;
+ }
+
+ if (page_has_next(page)
+ && page_rec_distance_is_at_most(
+ rec, page_get_supremum_rec(page),
+ max_nodes_deleted)) {
+ return true;
+ }
+
+ /* A delete of the leftmost record in a page causes a
+ delete & insert at its parent page. After that, the
+ delete might cause btr_compress() and delete a record
+ at its parent page. Thus we should consider max deletes. */
+ margin *= max_nodes_deleted;
+ }
+
+ /* Safe because we already have SX latch of the index tree */
+ if (page_get_data_size(page)
+ < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
+ return(true);
+ }
+ }
+
+ if (lock_intention >= BTR_INTENTION_BOTH) {
+ /* Check whether an insert could cause a tree modification.
+ (BTR_INTENTION_BOTH or BTR_INTENTION_INSERT) */
+
+ /* When btr_cur_limit_optimistic_insert_debug is in effect,
+ we should check it here in advance, since the max allowable
+ records in a page is limited. */
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);
+
+ /* needs 2 records' space for the case the single split and
+ insert cannot fit.
+ page_get_max_insert_size_after_reorganize() includes space
+ for page directory already */
+ ulint max_size
+ = page_get_max_insert_size_after_reorganize(page, 2);
+
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
+ || max_size < rec_size * 2) {
+ return(true);
+ }
+
+ /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
+ This is based on the worst case, and we could invoke
+ page_zip_available() on the block->page.zip. */
+ /* needs 2 records' space also for worst compress rate. */
+ if (zip_size
+ && page_zip_empty_size(index->n_fields, zip_size)
+ <= rec_size * 2 + page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + 2)) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Detects whether modifying the record might need a modification that is
+opposite to the intention.
+
+The checks are made in this order:
+- for a page with a compressed copy (bpage.zip.data), whether
+ page_zip_available() reports room for one node pointer of maximum size;
+- unless the intention is BTR_INTENTION_INSERT, whether the page has no
+ siblings, rec is the first or last record, or the page data size is below
+ node_ptr_max_size + compress_limit; for BTR_INTENTION_DELETE the answer is
+ then final (false);
+- for BTR_INTENTION_INSERT, whether rec is the last record of a page that
+ has a next page;
+- finally (BTR_INTENTION_INSERT and BTR_INTENTION_BOTH), whether the space
+ available after a reorganize is too small for the node pointer.
+@param bpage buffer pool page
+@param is_clust whether this is a clustered index
+@param lock_intention lock intention for the tree operation
+@param node_ptr_max_size the maximum size of a node pointer
+@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index)
+@param rec record (current node_ptr)
+@return true if tree modification is needed */
+static bool btr_cur_need_opposite_intention(const buf_page_t &bpage,
+ bool is_clust,
+ btr_intention_t lock_intention,
+ ulint node_ptr_max_size,
+ ulint compress_limit,
+ const rec_t *rec)
+{
+ if (UNIV_LIKELY_NULL(bpage.zip.data) &&
+ !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1))
+ return true;
+ const page_t *const page= bpage.frame;
+ if (lock_intention != BTR_INTENTION_INSERT)
+ {
+ /* We compensate also for btr_cur_compress_recommendation() */
+ if (!page_has_siblings(page) ||
+ page_rec_is_first(rec, page) || page_rec_is_last(rec, page) ||
+ page_get_data_size(page) < node_ptr_max_size + compress_limit)
+ return true;
+ if (lock_intention == BTR_INTENTION_DELETE)
+ return false; /* the insert-related checks below do not apply */
+ }
+ else if (page_has_next(page) && page_rec_is_last(rec, page))
+ return true;
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true);
+ const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2);
+ return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size ||
+ max_size < node_ptr_max_size * 2;
+}
+
+/**
+Compute an upper bound for the size of a node pointer record of an index.
+The bound is the fixed per-record overhead plus, for each of the first
+dict_index_get_n_unique_in_tree(index) fields, the maximum size of the
+field (limited by its prefix length, if any) and, for ROW_FORMAT=COMPACT
+style records, the bytes needed to encode a variable length.
+The change buffer index and a few system tables are special-cased.
+@param[in] index b-tree
+@return maximum size of a node pointer record in bytes */
+static ulint btr_node_ptr_max_size(const dict_index_t* index)
+{
+ if (dict_index_is_ibuf(index)) {
+ /* cannot estimate accurately */
+ /* This is universal index for change buffer.
+ The max size of the entry is about max key length * 2.
+ (index key + primary key to be inserted to the index)
+ (The max key length is UNIV_PAGE_SIZE / 16 * 3 at
+ ha_innobase::max_supported_key_length(),
+ considering MAX_KEY_LENGTH = 3072 at MySQL imposes
+ the 3500 historical InnoDB value for 16K page size case.)
+ For the universal index, node_ptr contains most of the entry.
+ And 512 is enough to contain ibuf columns and meta-data */
+ return srv_page_size / 8 * 3 + 512;
+ }
+
+ /* Each record has page_no, length of page_no and header. */
+ ulint comp = dict_table_is_comp(index->table);
+ ulint rec_max_size = comp
+ ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
+ + UT_BITS_IN_BYTES(index->n_nullable)
+ : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
+ + 2 * index->n_fields;
+
+ /* Compute the maximum possible record size. */
+ for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint field_max_size;
+ ulint field_ext_max_size;
+
+ /* Determine the maximum length of the index field. */
+
+ field_max_size = dict_col_get_fixed_size(col, comp);
+ if (field_max_size) {
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || field->fixed_len == field->prefix_len);
+ /* Fixed lengths are not encoded
+ in ROW_FORMAT=COMPACT. */
+ rec_max_size += field_max_size;
+ continue;
+ }
+
+ field_max_size = dict_col_get_max_size(col);
+ if (UNIV_UNLIKELY(!field_max_size)) {
+ switch (col->mtype) {
+ case DATA_VARCHAR:
+ if (!comp
+ && (!strcmp(index->table->name.m_name,
+ "SYS_FOREIGN")
+ || !strcmp(index->table->name.m_name,
+ "SYS_FOREIGN_COLS"))) {
+ break;
+ }
+ /* fall through */
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ /* BINARY(0), VARBINARY(0),
+ CHAR(0) and VARCHAR(0) are possible
+ data type definitions in MariaDB.
+ The InnoDB internal SQL parser maps
+ CHAR to DATA_VARCHAR, so DATA_CHAR (or
+ DATA_MYSQL) is only coming from the
+ MariaDB SQL layer. */
+ if (comp) {
+ /* Add a length byte, because
+ fixed-length empty fields are
+ encoded as variable-length.
+ For ROW_FORMAT=REDUNDANT,
+ these bytes were added to
+ rec_max_size before this loop. */
+ rec_max_size++;
+ }
+ continue;
+ }
+
+ /* SYS_FOREIGN.ID is defined as CHAR in the
+ InnoDB internal SQL parser, which translates
+ into the incorrect VARCHAR(0). InnoDB does
+ not enforce maximum lengths of columns, so
+ that is why any data can be inserted in the
+ first place.
+
+ Likewise, SYS_FOREIGN.FOR_NAME,
+ SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
+ defined as CHAR, and also they are part of a key. */
+
+ ut_ad(!strcmp(index->table->name.m_name,
+ "SYS_FOREIGN")
+ || !strcmp(index->table->name.m_name,
+ "SYS_FOREIGN_COLS"));
+ ut_ad(!comp);
+ ut_ad(col->mtype == DATA_VARCHAR);
+
+ rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
+ ? REDUNDANT_REC_MAX_DATA_SIZE
+ : page_get_free_space_of_empty(FALSE) / 2;
+ } else if (field_max_size == NAME_LEN && i == 1
+ && (!strcmp(index->table->name.m_name,
+ TABLE_STATS_NAME)
+ || !strcmp(index->table->name.m_name,
+ INDEX_STATS_NAME))) {
+ /* Interpret "table_name" as VARCHAR(199) even
+ if it was incorrectly defined as VARCHAR(64).
+ While the caller of ha_innobase enforces the
+ maximum length on any data written, the InnoDB
+ internal SQL parser will happily write as much
+ data as is provided. The purpose of this hack
+ is to avoid InnoDB hangs after persistent
+ statistics on partitioned tables are
+ deleted. */
+ field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
+ }
+ field_ext_max_size = field_max_size < 256 ? 1 : 2; /* derived from the column maximum, before the prefix_len clamp below */
+
+ if (field->prefix_len
+ && field->prefix_len < field_max_size) {
+ field_max_size = field->prefix_len;
+ }
+
+ if (comp) {
+ /* Add the extra size for ROW_FORMAT=COMPACT.
+ For ROW_FORMAT=REDUNDANT, these bytes were
+ added to rec_max_size before this loop. */
+ rec_max_size += field_ext_max_size;
+ }
+
+ rec_max_size += field_max_size;
+ }
+
+ return rec_max_size;
+}
+
+/** Translate a leaf page search mode into the mode to use on non-leaf
+pages: PAGE_CUR_G becomes PAGE_CUR_LE, PAGE_CUR_GE becomes PAGE_CUR_L,
+and PAGE_CUR_L and PAGE_CUR_LE are used as they are.
+@return a B-tree search mode suitable for non-leaf pages
+@param mode leaf page search mode */
+static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)
+{
+  if (mode < PAGE_CUR_GE)
+  {
+    ut_ad(mode == PAGE_CUR_G);
+    return PAGE_CUR_LE;
+  }
+
+  if (mode == PAGE_CUR_GE)
+  {
+    return PAGE_CUR_L;
+  }
+
+  /* Only the "less" modes remain; they need no translation. */
+  ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+  return mode;
+}
+
+static MY_ATTRIBUTE((nonnull))
+/** Acquire a latch on the previous page without violating the latching order.
+
+The previous page (btr_page_get_prev() of block) is first buffer-fixed
+without a latch, and a non-blocking s_lock_try() or x_lock_try() is
+attempted while the latch on block is still held. If that fails, the latch
+on block is released (its mini-transaction memo entry is re-registered as
+MTR_MEMO_BUF_FIX), the previous page is requested again with rw_latch, and
+then mtr->upgrade_buffer_fix() is invoked for the entry of block.
+NOTE(review): presumably rollback_to_savepoint() drops the unlatched
+buffer-fix of the previous page and upgrade_buffer_fix() re-acquires the
+latch on block - confirm against the mtr_t implementation.
+
+Finally, the page type, the index ID and the "compact" flag of the two
+pages are compared; any mismatch is reported as DB_CORRUPTION.
+@param block index page; must be the latest entry in the mtr memo
+@param page_id page identifier with valid space identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH)
+@param mtr mini-transaction
+@param err error code
+@retval 0 if an error occurred
+@retval 1 if the page could be latched in the wrong order
+@retval -1 if the latch on block was temporarily released */
+int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size,
+ rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err)
+{
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+ ut_ad(page_id.space() == block->page.id().space());
+
+ const auto prev_savepoint= mtr->get_savepoint();
+ ut_ad(block == mtr->at_savepoint(prev_savepoint - 1));
+
+ page_id.set_page_no(btr_page_get_prev(block->page.frame));
+ buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr,
+ BUF_GET, mtr, err, false);
+ if (UNIV_UNLIKELY(!prev))
+ return 0;
+
+ int ret= 1;
+ if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH))
+ {
+ if (UNIV_LIKELY(prev->page.lock.s_lock_try()))
+ {
+ mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX);
+ goto prev_latched;
+ }
+ block->page.lock.s_unlock();
+ }
+ else
+ {
+ if (UNIV_LIKELY(prev->page.lock.x_lock_try()))
+ {
+ mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX);
+ goto prev_latched;
+ }
+ block->page.lock.x_unlock();
+ }
+
+ ret= -1; /* from here on, the latch on block has been released */
+ mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX);
+ mtr->rollback_to_savepoint(prev_savepoint);
+ prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev,
+ BUF_GET, mtr, err, false);
+ if (UNIV_UNLIKELY(!prev))
+ return 0;
+ mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch);
+
+ prev_latched:
+ if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame,
+ FIL_PAGE_TYPE + block->page.frame, 2) ||
+ memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame,
+ PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) ||
+ page_is_comp(prev->page.frame) != page_is_comp(block->page.frame))
+ {
+ ut_ad("corrupted" == 0); // FIXME: remove this
+ *err= DB_CORRUPTION;
+ ret= 0;
+ }
+
+ return ret;
+}
+
+/** Search an index tree and position this cursor on a leaf page record.
+When the adaptive hash index is compiled in and enabled, a hash lookup is
+attempted first; otherwise (or if that fails) the tree is descended from
+the root with a binary search on each page. If an insert, delete-mark or
+delete was requested and the leaf page is not in the buffer pool, the
+operation may instead be buffered via ibuf_insert(); in that case flag is
+set to one of the BTR_CUR_*_IBUF values (or BTR_CUR_DELETE_REF) and no leaf
+page is latched.
+@param tuple search tuple
+@param mode leaf page search mode
+ (PAGE_CUR_G, PAGE_CUR_GE, PAGE_CUR_L or PAGE_CUR_LE)
+@param latch_mode latching mode, possibly combined with flags such as
+ BTR_ALREADY_S_LATCHED, BTR_INSERT, BTR_DELETE,
+ BTR_DELETE_MARK, BTR_IGNORE_SEC_UNIQUE
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ buf_block_t *guess;
+ btr_op_t btr_op;
+ btr_intention_t lock_intention;
+ bool detected_same_key_root= false;
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets2_;
+ rec_offs_init(offsets_);
+ rec_offs_init(offsets2_);
+
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(index()->page != FIL_NULL);
+
+ MEM_UNDEFINED(&up_match, sizeof up_match);
+ MEM_UNDEFINED(&up_bytes, sizeof up_bytes);
+ MEM_UNDEFINED(&low_match, sizeof low_match);
+ MEM_UNDEFINED(&low_bytes, sizeof low_bytes);
+ ut_d(up_match= ULINT_UNDEFINED);
+ ut_d(low_match= ULINT_UNDEFINED);
+
+ ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) ||
+ mtr->memo_contains_flagged(&index()->lock,
+ MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK |
+ MTR_MEMO_X_LOCK));
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set. */
+ switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) {
+ default:
+ btr_op= BTR_NO_OP;
+ break;
+ case BTR_INSERT:
+ btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE)
+ ? BTR_INSERT_IGNORE_UNIQUE_OP
+ : BTR_INSERT_OP;
+ break;
+ case BTR_DELETE:
+ btr_op= BTR_DELETE_OP;
+ ut_a(purge_node);
+ break;
+ case BTR_DELETE_MARK:
+ btr_op= BTR_DELMARK_OP;
+ break;
+ }
+
+ /* Operations on the insert buffer tree cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf());
+ /* Operations on the clustered index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->is_clust());
+ /* Operations on the temporary table(indexes) cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary());
+
+ const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
+ lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
+ latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ ut_ad(!latch_by_caller
+ || latch_mode == BTR_SEARCH_LEAF
+ || latch_mode == BTR_MODIFY_LEAF
+ || latch_mode == BTR_MODIFY_TREE
+ || latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
+
+ flag= BTR_CUR_BINARY;
+#ifndef BTR_CUR_ADAPT
+ guess= nullptr;
+#else
+ btr_search_t *info= btr_search_get_info(index());
+ guess= info->root_guess;
+
+# ifdef BTR_CUR_HASH_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+# endif
+ bool ahi_enabled= btr_search_enabled && !index()->is_ibuf();
+ /* We do a dirty read of btr_search_enabled below,
+ and btr_search_guess_on_hash() will have to check it again. */
+ if (!ahi_enabled);
+ else if (btr_search_guess_on_hash(index(), info, tuple, mode,
+ latch_mode, this, mtr))
+ {
+ /* Search using the hash index succeeded */
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ++btr_cur_n_sea;
+
+ return DB_SUCCESS;
+ }
+ else
+ ++btr_cur_n_non_sea;
+# endif
+#endif
+
+ /* If the hash search did not succeed, do binary search down the
+ tree */
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
+
+ const ulint savepoint= mtr->get_savepoint();
+
+ ulint node_ptr_max_size= 0, compress_limit= 0;
+ rw_lock_type_t rw_latch= RW_S_LATCH;
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ rw_latch= RW_X_LATCH;
+ node_ptr_max_size= btr_node_ptr_max_size(index());
+ if (latch_by_caller)
+ {
+ ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK));
+ break;
+ }
+ if (lock_intention == BTR_INTENTION_DELETE)
+ {
+ compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index());
+ if (os_aio_pending_reads_approx() &&
+ trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
+ {
+ /* Most delete-intended operations are due to the purge of history.
+ Prioritize them when the history list is growing huge. */
+ mtr_x_lock_index(index(), mtr);
+ break;
+ }
+ }
+ mtr_sx_lock_index(index(), mtr);
+ break;
+#ifdef UNIV_DEBUG
+ case BTR_CONT_MODIFY_TREE:
+ ut_ad("invalid mode" == 0);
+ break;
+#endif
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ rw_latch= RW_SX_LATCH;
+ /* fall through */
+ default:
+ if (!latch_by_caller)
+ mtr_s_lock_index(index(), mtr);
+ }
+
+ const ulint zip_size= index()->table->space->zip_size();
+
+ /* Start with the root page. */
+ page_id_t page_id(index()->table->space_id, index()->page);
+
+ const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode);
+ ulint height= ULINT_UNDEFINED;
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint buf_mode= BUF_GET;
+ search_loop:
+ dberr_t err;
+ auto block_savepoint= mtr->get_savepoint();
+ buf_block_t *block=
+ buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr,
+ &err, height == 0 && !index()->is_clust());
+ if (!block)
+ {
+ switch (err) {
+ case DB_DECRYPTION_FAILED:
+ btr_decryption_failed(*index());
+ /* fall through */
+ default:
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ case DB_SUCCESS:
+ /* This must be a search to perform an insert, delete mark, or delete;
+ try using the change buffer */
+ ut_ad(height == 0);
+ ut_ad(thr);
+ break;
+ }
+
+ switch (btr_op) {
+ default:
+ MY_ASSERT_UNREACHABLE();
+ break;
+ case BTR_INSERT_OP:
+ case BTR_INSERT_IGNORE_UNIQUE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr))
+ {
+ flag= BTR_CUR_INSERT_TO_IBUF;
+ goto func_exit;
+ }
+ break;
+
+ case BTR_DELMARK_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
+ index(), page_id, zip_size, thr))
+ {
+ flag = BTR_CUR_DEL_MARK_IBUF;
+ goto func_exit;
+ }
+
+ break;
+
+ case BTR_DELETE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+ auto& chain = buf_pool.page_hash.cell_get(page_id.fold());
+
+ if (!row_purge_poss_sec(purge_node, index(), tuple))
+ /* The record cannot be purged yet. */
+ flag= BTR_CUR_DELETE_REF;
+ else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(),
+ page_id, zip_size, thr))
+ /* The purge was buffered. */
+ flag= BTR_CUR_DELETE_IBUF;
+ else
+ {
+ /* The purge could not be buffered. */
+ buf_pool.watch_unset(page_id, chain);
+ break;
+ }
+
+ buf_pool.watch_unset(page_id, chain);
+ goto func_exit;
+ }
+
+ /* Change buffering did not succeed, we must read the page. */
+ buf_mode= BUF_GET;
+ goto search_loop;
+ }
+
+ if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index()->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ {
+ corrupted:
+ ut_ad("corrupted" == 0); // FIXME: remove this
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ page_cur.block= block;
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+ ut_ad(rw_latch != RW_NO_LATCH);
+#ifdef UNIV_ZIP_DEBUG
+ if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
+ ut_a(page_zip_validate(page_zip, block->page.frame, index()));
+#endif /* UNIV_ZIP_DEBUG */
+
+ const uint32_t page_level= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the B-tree index root page. */
+#ifdef BTR_CUR_ADAPT
+ info->root_guess= block;
+#endif
+ height= page_level;
+ tree_height= height + 1;
+
+ if (!height)
+ {
+ /* The root page is also a leaf page.
+ We may have to reacquire the page latch in a different mode. */
+ switch (rw_latch) {
+ case RW_S_LATCH:
+ /* latch_mode & ~12 strips the bits that distinguish
+ BTR_SEARCH_PREV, BTR_MODIFY_PREV and BTR_CONT_MODIFY_TREE from
+ the plain leaf modes, leaving the leaf page latch type. */
+ if ((latch_mode & ~12) != RW_S_LATCH)
+ {
+ ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH);
+ goto relatch_x;
+ }
+ if (latch_mode != BTR_MODIFY_PREV)
+ {
+ if (!latch_by_caller)
+ /* Release the tree s-latch */
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ goto reached_latched_leaf;
+ }
+ /* fall through */
+ case RW_SX_LATCH:
+ ut_ad(rw_latch == RW_S_LATCH ||
+ latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
+ relatch_x:
+ mtr->rollback_to_savepoint(block_savepoint);
+ height= ULINT_UNDEFINED;
+ rw_latch= RW_X_LATCH;
+ goto search_loop;
+ case RW_X_LATCH:
+ if (latch_mode == BTR_MODIFY_TREE)
+ goto reached_index_root_and_leaf;
+ goto reached_root_and_leaf;
+ case RW_NO_LATCH:
+ ut_ad(0);
+ }
+ goto reached_leaf;
+ }
+ }
+ else if (UNIV_UNLIKELY(height != page_level))
+ goto corrupted;
+ else
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ break;
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() ==
+ index()->page) == (tree_height <= height + 2));
+ if (tree_height <= height + 2)
+ /* Retain the root page latch. */
+ break;
+ /* fall through */
+ default:
+ ut_ad(block_savepoint > savepoint);
+ mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
+ block_savepoint--;
+ }
+
+ if (!height)
+ {
+ reached_leaf:
+ /* We reached the leaf level. */
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+
+ if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF)
+ {
+ reached_root_and_leaf:
+ if (!latch_by_caller)
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ reached_index_root_and_leaf:
+ ut_ad(rw_latch == RW_X_LATCH);
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ goto func_exit;
+ }
+
+ switch (latch_mode) {
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+ static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+ ut_ad(!latch_by_caller);
+ ut_ad(rw_latch ==
+ rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)));
+
+ /* latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
+ goto func_exit;
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ rw_latch, false, mtr, &err))
+ goto func_exit;
+ goto release_tree;
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ if (!latch_by_caller)
+ {
+release_tree:
+ /* Release the tree s-latch */
+ block_savepoint--;
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ }
+ /* release upper blocks */
+ if (savepoint < block_savepoint)
+ mtr->rollback_to_savepoint(savepoint, block_savepoint);
+ break;
+ default:
+ ut_ad(latch_mode == BTR_MODIFY_TREE);
+ ut_ad(rw_latch == RW_X_LATCH);
+ /* x-latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
+ goto func_exit;
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
+ lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ goto need_opposite_intention;
+ }
+
+ reached_latched_leaf:
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (page_cur_search_with_match_bytes(tuple, mode,
+ &up_match, &up_bytes,
+ &low_match, &low_bytes, &page_cur))
+ goto corrupted;
+ }
+ else
+#endif /* BTR_CUR_HASH_ADAPT */
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ goto func_exit;
+ }
+
+ guess= nullptr;
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
+
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+
+ switch (latch_mode) {
+ default:
+ break;
+ case BTR_MODIFY_TREE:
+ if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
+ lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ {
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry. */
+ need_opposite_intention:
+ /* This exit does not pass through func_exit, so any heap that
+ rec_get_offsets() allocated must be released here. */
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return pessimistic_search_leaf(tuple, mode, mtr);
+ }
+
+ if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH ||
+ index()->is_unique() ||
+ (up_match <= rec_offs_n_fields(offsets) &&
+ low_match <= rec_offs_n_fields(offsets)))
+ break;
+
+ /* If the cursor is on the first or the last record of the page, or
+ on a record with the same key value as the first or the last record,
+ then another page might be chosen in BTR_CONT_MODIFY_TREE. In that
+ case the parent page must not be released, in order to avoid a
+ deadlock with another search for the same key value. */
+ const rec_t *first=
+ page_rec_get_next_const(page_get_infimum_rec(block->page.frame));
+ ulint matched_fields;
+
+ if (UNIV_UNLIKELY(!first))
+ goto corrupted;
+ if (page_cur.rec == first ||
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ same_key_root:
+ detected_same_key_root= true;
+ break;
+ }
+
+ matched_fields= 0;
+ offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED,
+ &heap);
+ cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false,
+ &matched_fields);
+ if (matched_fields >= rec_offs_n_fields(offsets) - 1)
+ goto same_key_root;
+ if (const rec_t* last=
+ page_rec_get_prev_const(page_get_supremum_rec(block->page.frame)))
+ {
+ matched_fields= 0;
+ offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED,
+ &heap);
+ cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false,
+ &matched_fields);
+ if (matched_fields >= rec_offs_n_fields(offsets) - 1)
+ goto same_key_root;
+ }
+ else
+ goto corrupted;
+
+ /* Release the non-root parent page unless it may need to be modified. */
+ if (tree_height > height + 1 &&
+ !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention,
+ page_cur.rec, node_ptr_max_size,
+ zip_size, mtr))
+ {
+ mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
+ block_savepoint--;
+ }
+ }
+
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));
+
+ if (!--height)
+ {
+ /* We are about to access the leaf level. */
+
+ switch (latch_mode) {
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ rw_latch= RW_X_LATCH;
+ break;
+ case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */
+ case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+ if (page_has_prev(block->page.frame) &&
+ page_rec_is_first(page_cur.rec, block->page.frame))
+ {
+ ut_ad(block_savepoint + 1 == mtr->get_savepoint());
+
+ /* Latch the previous page if the node pointer is the leftmost
+ of the current page. */
+ int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err);
+ if (!ret)
+ goto func_exit;
+ ut_ad(block_savepoint + 2 == mtr->get_savepoint());
+ if (ret < 0)
+ {
+ /* While our latch on the level-2 page prevents splits or
+ merges of this level-1 block, other threads may have
+ modified it due to splitting or merging some level-0 (leaf)
+ pages underneath it. Thus, we must search again. */
+ if (page_cur_search_with_match(tuple, page_mode,
+ &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec,
+ offsets));
+ }
+ }
+ rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
+ break;
+ case BTR_MODIFY_LEAF:
+ case BTR_SEARCH_LEAF:
+ rw_latch= rw_lock_type_t(latch_mode);
+ if (btr_op != BTR_NO_OP && !index()->is_ibuf() &&
+ ibuf_should_try(index(), btr_op != BTR_INSERT_OP))
+ /* Try to buffer the operation if the leaf page
+ is not in the buffer pool. */
+ buf_mode= btr_op == BTR_DELETE_OP
+ ? BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ break;
+ case BTR_MODIFY_TREE:
+ ut_ad(rw_latch == RW_X_LATCH);
+
+ if (lock_intention == BTR_INTENTION_INSERT &&
+ page_has_next(block->page.frame) &&
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ /* btr_insert_into_right_sibling() might cause deleting node_ptr
+ at upper level */
+ mtr->rollback_to_savepoint(block_savepoint);
+ goto need_opposite_intention;
+ }
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ }
+ }
+
+ goto search_loop;
+}
+
+/** Ensure that the index latch recorded in the memo slot just below the
+current savepoint is held in exclusive mode. A slot that is already
+MTR_MEMO_X_LOCK is left alone; otherwise the slot is expected to be
+MTR_MEMO_SX_LOCK and the latch is upgraded with u_x_upgrade(). */
+ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
+{
+  auto &entry= m_memo[get_savepoint() - 1];
+  if (entry.type != MTR_MEMO_X_LOCK)
+  {
+    ut_ad(entry.type == MTR_MEMO_SX_LOCK);
+    static_cast<index_lock*>(entry.object)->u_x_upgrade(SRW_LOCK_CALL);
+    entry.type= MTR_MEMO_X_LOCK;
+  }
+}
+
+ATTRIBUTE_COLD /** Search for a leaf page record, X-latching every page on
+the search path. The page at savepoint 1 of the mini-transaction is expected
+to be the index root; everything registered after it is rolled back, the
+index latch is upgraded via mtr_t::index_lock_upgrade(), and the tree is
+descended again from the root. Each page below the root is X-latched
+together with its left and right sibling, if any.
+@param tuple search tuple
+@param mode leaf page search mode
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+{
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets= offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flag == BTR_CUR_BINARY);
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ buf_block_t *block= mtr->at_savepoint(1);
+ ut_ad(block->page.id().page_no() == index()->page);
+ block->page.fix(); /* NOTE(review): presumably keeps the root buffer-fixed across the rollback below - confirm */
+ mtr->rollback_to_savepoint(1);
+ mtr->index_lock_upgrade();
+
+ const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)};
+
+ mtr->page_lock(block, RW_X_LATCH);
+
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint height= btr_page_get_level(block->page.frame);
+ tree_height= height + 1;
+ mem_heap_t *heap= nullptr;
+
+ search_loop:
+ dberr_t err;
+ page_cur.block= block;
+
+ if (UNIV_UNLIKELY(!height))
+ {
+ /* Leaf level: search with the caller's mode and return. */
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ corrupted:
+ err= DB_CORRUPTION;
+ else
+ {
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
+ err= DB_SUCCESS;
+ }
+
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
+
+ /* Non-leaf level: search with the non-leaf mode to find the child. */
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+
+ page_id_t page_id{block->page.id()};
+
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));
+
+ block=
+ buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET,
+ mtr, &err, !--height && !index()->is_clust());
+
+ if (!block)
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index());
+ goto func_exit;
+ }
+
+ if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index()->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ goto corrupted;
+
+ /* The child must be exactly one level below its parent. */
+ if (height != btr_page_get_level(block->page.frame))
+ goto corrupted;
+
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index()));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* x-latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_latch_prev(block, page_id, block->zip_size(),
+ RW_X_LATCH, mtr, &err))
+ goto func_exit;
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ goto search_loop;
+}
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+cursor->up_match and cursor->low_match both will have sensible values.
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
+@param level the tree level of search (must be nonzero)
+@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
+ it cannot get compared to the node ptr page number field!
+@param rw_latch RW_S_LATCH or RW_X_LATCH
+@param cursor tree cursor; the cursor page is s- or x-latched, but see also
+ above!
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise
+(DB_CORRUPTION also if the root page turns out to be a leaf page) */
+TRANSACTIONAL_TARGET
+dberr_t btr_cur_search_to_nth_level(ulint level,
+ const dtuple_t *tuple,
+ rw_lock_type_t rw_latch,
+ btr_cur_t *cursor, mtr_t *mtr)
+{
+ dict_index_t *const index= cursor->index();
+
+ ut_ad(index->is_btree() || index->is_ibuf());
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+ ut_ad(level);
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(index->is_ibuf() ? ibuf_inside(mtr) : index->is_btree());
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(index->page != FIL_NULL);
+
+ MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
+ MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
+ cursor->up_match= 0;
+ cursor->low_match= 0;
+ cursor->flag= BTR_CUR_BINARY;
+
+#ifndef BTR_CUR_ADAPT
+ buf_block_t *block= nullptr;
+#else
+ btr_search_t *info= btr_search_get_info(index);
+ buf_block_t *block= info->root_guess;
+#endif /* BTR_CUR_ADAPT */
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+
+ const ulint zip_size= index->table->space->zip_size();
+
+ /* Start with the root page. */
+ page_id_t page_id(index->table->space_id, index->page);
+ ulint height= ULINT_UNDEFINED;
+
+search_loop:
+ dberr_t err= DB_SUCCESS;
+ /* Reuse the page if this mini-transaction already holds it in rw_latch
+ mode; otherwise fetch and latch it. */
+ if (buf_block_t *b=
+ mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch)))
+ block= b;
+ else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch,
+ block, BUF_GET, mtr, &err)))
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ goto func_exit;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
+ ut_a(page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
+
+ const uint32_t page_level= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= page_level;
+ /* The requested level is nonzero, so a leaf root cannot contain it. */
+ if (!height)
+ goto corrupted;
+ cursor->tree_height= height + 1;
+ }
+ else if (height != ulint{page_level})
+ goto corrupted;
+
+ cursor->page_cur.block= block;
+
+ /* Search for complete index fields. */
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match,
+ &cursor->low_match, &cursor->page_cur,
+ nullptr))
+ goto corrupted;
+
+ /* If this is the desired level, leave the loop */
+ if (level == height)
+ goto func_exit;
+
+ ut_ad(height > level);
+ height--;
+
+ offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
+ offsets));
+ block= nullptr; /* the root guess only applies to the first iteration */
+ goto search_loop;
+}
+/** Open this cursor at one end of an index tree.
+Starting from the root, the first (or last) node pointer of each page is
+followed down to the leaf level. On the leaf page the page cursor is left
+before the first record or after the last record.
+@param first true=follow the first node pointers and position before
+ the first record; false=follow the last node pointers
+ and position after the last record
+@param index index tree
+@param latch_mode latching mode, possibly combined with
+ BTR_ALREADY_S_LATCHED or an intention flag
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ ulint n_blocks= 0;
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ dberr_t err;
+
+ rec_offs_init(offsets_);
+
+ const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
+ latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED);
+
+ btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched the leaf node */
+
+ auto savepoint= mtr->get_savepoint();
+
+ rw_lock_type_t upper_rw_latch= RW_X_LATCH;
+ ulint node_ptr_max_size= 0, compress_limit= 0;
+
+ if (latch_mode == BTR_MODIFY_TREE)
+ {
+ node_ptr_max_size= btr_node_ptr_max_size(index);
+ /* Most of delete-intended operations are purging. Free blocks
+ and read IO bandwidth should be prioritized for them, when the
+ history list is growing huge. */
+ savepoint++;
+ if (lock_intention == BTR_INTENTION_DELETE)
+ {
+ compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index);
+
+ if (os_aio_pending_reads_approx() &&
+ trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
+ {
+ mtr_x_lock_index(index, mtr);
+ goto index_locked;
+ }
+ }
+ mtr_sx_lock_index(index, mtr);
+ }
+ else
+ {
+ static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), "");
+ ut_ad(!(latch_mode & 8));
+ /* This function does not need to latch the left sibling of the leaf
+ page, so BTR_SEARCH_PREV and BTR_MODIFY_PREV are treated like
+ BTR_SEARCH_LEAF and BTR_MODIFY_LEAF. */
+ static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), "");
+ latch_mode= btr_latch_mode(latch_mode & ~4);
+ ut_ad(!latch_by_caller ||
+ mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK));
+ upper_rw_latch= RW_S_LATCH;
+ if (!latch_by_caller)
+ {
+ savepoint++;
+ mtr_s_lock_index(index, mtr);
+ }
+ }
+
+index_locked:
+ ut_ad(savepoint == mtr->get_savepoint());
+
+ const rw_lock_type_t root_leaf_rw_latch=
+ rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH));
+
+ page_cur.index = index;
+
+ uint32_t page= index->page;
+ const auto zip_size= index->table->space->zip_size();
+
+ for (ulint height= ULINT_UNDEFINED;;)
+ {
+ ut_ad(n_blocks < BTR_MAX_LEVELS);
+ ut_ad(savepoint + n_blocks == mtr->get_savepoint());
+
+ buf_block_t* block=
+ btr_block_get(*index, page,
+ height ? upper_rw_latch : root_leaf_rw_latch,
+ !height, mtr, &err);
+ ut_ad(!block == (err != DB_SUCCESS));
+
+ if (!block)
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ break;
+ }
+
+ if (first)
+ page_cur_set_before_first(block, &page_cur);
+ else
+ page_cur_set_after_last(block, &page_cur);
+
+ const uint32_t l= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= l;
+ if (height);
+ else if (upper_rw_latch != root_leaf_rw_latch)
+ {
+ /* The root page is also the leaf page, but it was latched in
+ the mode intended for upper-level pages. Release it and fetch
+ it again in the leaf page latch mode. */
+ ut_ad(n_blocks == 0);
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ upper_rw_latch= root_leaf_rw_latch;
+ mtr->rollback_to_savepoint(savepoint);
+ height= ULINT_UNDEFINED;
+ continue;
+ }
+ else
+ {
+ reached_leaf:
+ const auto leaf_savepoint= mtr->get_savepoint();
+ ut_ad(leaf_savepoint);
+ ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1));
+
+ if (latch_mode == BTR_MODIFY_TREE)
+ {
+ /* x-latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH,
+ mtr, &err))
+ break;
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index, btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ break;
+
+ if (!index->lock.have_x() &&
+ btr_cur_need_opposite_intention(block->page, index->is_clust(),
+ lock_intention,
+ node_ptr_max_size,
+ compress_limit, page_cur.rec))
+ goto need_opposite_intention;
+ }
+ else
+ {
+ if (latch_mode != BTR_CONT_MODIFY_TREE)
+ {
+ ut_ad(latch_mode == BTR_MODIFY_LEAF ||
+ latch_mode == BTR_SEARCH_LEAF);
+ /* Release index->lock if needed, and the non-leaf pages. */
+ mtr->rollback_to_savepoint(savepoint - !latch_by_caller,
+ leaf_savepoint - 1);
+ }
+ }
+ break;
+ }
+ }
+ else if (UNIV_UNLIKELY(height != l))
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ if (!height)
+ goto reached_leaf;
+
+ height--;
+
+ /* Step onto the first or the last node pointer record of this page;
+ failure to do so is treated as corruption. */
+ if (first
+ ? !page_cur_move_to_next(&page_cur)
+ : !page_cur_move_to_prev(&page_cur))
+ goto corrupted;
+
+ offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED,
+ &heap);
+
+ ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH);
+
+ if (latch_mode != BTR_MODIFY_TREE);
+ else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
+ lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ {
+ need_opposite_intention:
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry: release all
+ pages, upgrade the index latch and restart from the root. */
+
+ mtr->rollback_to_savepoint(savepoint);
+ mtr->index_lock_upgrade();
+ /* X-latch all pages from now on */
+ latch_mode= BTR_CONT_MODIFY_TREE;
+ page= index->page;
+ height= ULINT_UNDEFINED;
+ n_blocks= 0;
+ continue;
+ }
+ else
+ {
+ if (!btr_cur_will_modify_tree(index, block->page.frame,
+ lock_intention, page_cur.rec,
+ node_ptr_max_size, zip_size, mtr))
+ {
+ ut_ad(n_blocks);
+ /* release buffer-fixes on pages that will not be modified
+ (except the root) */
+ if (n_blocks > 1)
+ {
+ mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1);
+ n_blocks= 1;
+ }
+ }
+ }
+
+ /* Go to the child node */
+ page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
+ n_blocks++;
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ return err;
+}
+
+/*==================== B-TREE INSERT =========================*/
+
+/*************************************************************//**
+Attempts to insert a tuple on the cursor page. If the first attempt
+fails on an uncompressed page, the page is reorganized once and the
+insert is retried. Unlike btr_cur_optimistic_insert(), no heuristic
+decides whether the CPU time for reorganizing the page is worth spending.
+
+IMPORTANT: If this is a compressed leaf page in a secondary index,
+the caller has to update IBUF_BITMAP_FREE, either within the same
+mini-transaction or by invoking ibuf_reset_free_bits() before
+mtr_commit().
+
+@return pointer to the inserted record on success, else NULL */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+  btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+                     cursor stays valid */
+  const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
+                         have been stored to tuple */
+  rec_offs** offsets,/*!< out: offsets on *rec */
+  mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+  ulint n_ext, /*!< in: number of externally stored columns */
+  mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+  ut_ad(dtuple_check_typed(tuple));
+  ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+                                   MTR_MEMO_PAGE_X_FIX));
+
+  page_cur_t* const pcur = btr_cur_get_page_cur(cursor);
+
+  /* First attempt */
+  rec_t* inserted = page_cur_tuple_insert(pcur, tuple, offsets, heap,
+                                          n_ext, mtr);
+
+  if (!inserted) {
+    /* The record did not fit. For compressed pages,
+    page_cur_tuple_insert() attempted a reorganize already,
+    so only uncompressed pages are reorganized here. */
+    const bool retry = !page_cur_get_page_zip(pcur)
+      && btr_page_reorganize(pcur, mtr) == DB_SUCCESS;
+
+    if (retry) {
+      inserted = page_cur_tuple_insert(pcur, tuple, offsets,
+                                       heap, n_ext, mtr);
+    }
+  }
+
+  ut_ad(!inserted || rec_offs_validate(inserted, pcur->index, *offsets));
+  return inserted;
+}
+
+/*************************************************************//**
+For an insert, checks the locks and does the undo logging if desired.
+
+Steps, in order:
+1. Unless BTR_NO_LOCKING_FLAG is set, check for a lock that prevents the
+insertion: for a DICT_SPATIAL index a predicate lock built from the MBR
+of entry (and *inherit is then set to false), otherwise
+lock_rec_insert_check_and_lock(), which is handed the inherit pointer.
+A nonzero result of either check is returned immediately.
+2. Return DB_SUCCESS if the index is not the primary index or if the page
+of the cursor record is not a leaf page.
+3. Unless BTR_NO_UNDO_LOG_FLAG is set, call
+trx_undo_report_row_operation(); if it replaced the dummy roll pointer,
+the id of the transaction of thr is stored in the DB_TRX_ID field of entry.
+4. Unless BTR_KEEP_SYS_FLAG is set, the roll pointer is stored in the
+DB_ROLL_PTR field of entry.
+
+NOTE(review): the very first test, !(~flags | (...)), can only hold when
+every bit of flags is set, so that early return looks unreachable for
+ordinary flag combinations. Possibly '&' with a different mask was
+intended; confirm before relying on it or changing it.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: true if the inserted new record maybe
+ should inherit LOCK_GAP type locks from the
+ successor record */
+{
+ if (!(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) {
+ return DB_SUCCESS;
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec_t* rec = btr_cur_get_rec(cursor);
+ dict_index_t* index = cursor->index();
+
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad((flags & BTR_NO_UNDO_LOG_FLAG)
+ || !index->table->skip_alter_undo);
+
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* Check if there is predicate or GAP lock preventing the insertion */
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ const unsigned type = index->type;
+ if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
+ lock_prdt_t prdt;
+ rtr_mbr_t mbr;
+
+ rtr_get_mbr_from_tuple(entry, &mbr);
+
+ /* Use on stack MBR variable to test if a lock is
+ needed. If so, the predicate (MBR) will be allocated
+ from lock heap in lock_prdt_insert_check_and_lock() */
+ lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr);
+
+ if (dberr_t err = lock_prdt_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, &prdt)) {
+ return err;
+ }
+ *inherit = false;
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index)
+ || index->is_primary()
+ || (flags & BTR_CREATE_FLAG));
+#ifdef WITH_WSREP
+ trx_t* trx= thr_get_trx(thr);
+ /* If transaction scanning an unique secondary
+ key is wsrep high priority thread (brute
+ force) this scanning may involve GAP-locking
+ in the index. As this locking happens also
+ when applying replication events in high
+ priority applier threads, there is a
+ probability for lock conflicts between two
+ wsrep high priority threads. To avoid this
+ GAP-locking we mark that this transaction
+ is using unique key scan here. */
+ if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
+ && trx->is_wsrep()
+ && wsrep_thd_is_BF(trx->mysql_thd, false)) {
+ trx->wsrep = 3;
+ }
+#endif /* WITH_WSREP */
+ if (dberr_t err = lock_rec_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, inherit)) {
+ return err;
+ }
+ }
+ }
+
+ if (!index->is_primary() || !page_is_leaf(page_align(rec))) {
+ return DB_SUCCESS;
+ }
+
+ constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1}
+ << ROLL_PTR_INSERT_FLAG_POS;
+ roll_ptr_t roll_ptr = dummy_roll_ptr;
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ if (dberr_t err = trx_undo_report_row_operation(
+ thr, index, entry, NULL, 0, NULL, NULL,
+ &roll_ptr)) {
+ return err;
+ }
+
+ if (roll_ptr != dummy_roll_ptr) {
+ dfield_t* r = dtuple_get_nth_field(entry,
+ index->db_trx_id());
+ trx_write_trx_id(static_cast<byte*>(r->data),
+ thr_get_trx(thr)->id);
+ }
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Prefetch siblings of the leaf for the pessimistic operation.
+Nothing is requested when index->is_ibuf() holds.
+@param block leaf page
+@param index index of the page */
+static void btr_cur_prefetch_siblings(const buf_block_t *block,
+                                      const dict_index_t *index)
+{
+  ut_ad(page_is_leaf(block->page.frame));
+
+  if (index->is_ibuf())
+    return;
+
+  const page_t *frame= block->page.frame;
+  /* Both sibling page numbers are read before any read is requested. */
+  const uint32_t left= mach_read_from_4(my_assume_aligned<4>(frame +
+                                                             FIL_PAGE_PREV));
+  const uint32_t right= mach_read_from_4(my_assume_aligned<4>(frame +
+                                                              FIL_PAGE_NEXT));
+  const uint32_t siblings[]= {left, right};
+
+  fil_space_t *space= index->table->space;
+
+  for (const uint32_t page_no : siblings)
+  {
+    /* A FIL_NULL link is skipped; a background read is requested only
+    if space->acquire() succeeds. */
+    if (page_no != FIL_NULL && space->acquire())
+      buf_read_page_background(space, page_id_t(space->id, page_no),
+                               block->zip_size());
+  }
+}
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+
+Outcomes visible in this function:
+- DB_TOO_BIG_RECORD: dtuple_convert_big_rec() returned NULL, or
+page_zip_is_too_big() holds for a compressed page.
+- DB_FAIL: one of the space checks rejected the insert, or the insert
+into a compressed page failed; for a leaf page, background reads of its
+siblings are requested before returning.
+- DB_CORRUPTION: on an uncompressed page, the retry after
+btr_page_reorganize() did not succeed.
+- any value other than DB_SUCCESS from btr_cur_ins_lock_and_undo().
+On each of these failures a big_rec conversion of entry, if one was made,
+is undone with dtuple_convert_back_big_rec() and *big_rec stays NULL.
+On DB_SUCCESS, *rec points to the inserted record and *big_rec is the
+vector returned by dtuple_convert_big_rec(), or NULL if no conversion
+was needed.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ page_t* page;
+ rec_t* dummy;
+ bool leaf;
+ bool reorg __attribute__((unused));
+ bool inherit = true;
+ ulint rec_size;
+ dberr_t err;
+
+ ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = cursor->index();
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(dtuple_check_typed(entry));
+
+#ifdef HAVE_valgrind
+ if (block->page.zip.data) {
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size());
+ }
+#endif /* HAVE_valgrind */
+
+ leaf = page_is_leaf(page);
+
+ if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+ ut_ad(leaf);
+ goto convert_big_rec;
+ }
+
+ /* Calculate the record size when entry is converted to a record */
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+
+ if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+ dtuple_get_n_fields(entry),
+ block->zip_size())) {
+convert_big_rec:
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
+
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+ }
+
+ if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
+ if (big_rec_vec != NULL) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail);
+
+ if (block->page.zip.data && leaf
+ && (page_get_data_size(page) + rec_size
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+ /* If compression padding tells us that insertion will
+ result in too packed up page i.e.: which is likely to
+ cause compression failure then don't do an optimistic
+ insertion. */
+fail:
+ err = DB_FAIL;
+
+ /* prefetch siblings of the leaf for the pessimistic
+ operation, if the page is leaf. */
+ if (leaf) {
+ btr_cur_prefetch_siblings(block, index);
+ }
+fail_err:
+ /* Common exit for failures: err has been set by the code
+ that jumped here (or by the fail: path above). */
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
+ if (max_size < rec_size) {
+ goto fail;
+ }
+
+ const ulint n_recs = page_get_n_recs(page);
+ if (UNIV_UNLIKELY(n_recs >= 8189)) {
+ ut_ad(srv_page_size == 65536); /* debug builds expect this only with 64KiB pages */
+ goto fail;
+ }
+
+ if (page_has_garbage(page)) {
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+ && n_recs > 1
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+ }
+
+ /* If there have been many consecutive inserts to the
+ clustered index leaf page of an uncompressed table, check if
+ we have to split the page to reserve enough free space for
+ future updates of records. */
+
+ if (leaf && !block->page.zip.data && dict_index_is_clust(index)
+ && page_get_n_recs(page) >= 2
+ && dict_index_get_space_reserve() + rec_size > max_size
+ && (btr_page_get_split_rec_to_right(cursor, &dummy)
+ || btr_page_get_split_rec_to_left(cursor))) {
+ goto fail;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ DBUG_LOG("ib_cur",
+ "insert " << index->name << " (" << index->id << ") by "
+ << ib::hex(thr ? thr->graph->trx->id : 0)
+ << ' ' << rec_printer(entry).str());
+ DBUG_EXECUTE_IF("do_page_reorganize",
+ ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr)
+ == DB_SUCCESS););
+
+ /* Now, try the insert */
+ {
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+
+ /* Check locks and write to the undo log,
+ if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+ if (err != DB_SUCCESS) {
+ goto fail_err;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(flags & BTR_CREATE_FLAG)
+ && leaf && index->is_primary()) {
+ const dfield_t* trx_id = dtuple_get_nth_field(
+ entry, dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table,
+ DATA_TRX_ID),
+ index));
+
+ ut_ad(trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(*static_cast<const byte*>
+ (trx_id[1].data) & 0x80);
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ ut_ad(!memcmp(trx_id->data, reset_trx_id,
+ DATA_TRX_ID_LEN));
+ } else {
+ ut_ad(thr->graph->trx->id);
+ ut_ad(thr->graph->trx->bulk_insert
+ || thr->graph->trx->id
+ == trx_read_trx_id(
+ static_cast<const byte*>(
+ trx_id->data))
+ || index->table->is_temporary());
+ }
+ }
+#endif
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap,
+ n_ext, mtr);
+
+ /* The cursor record changing across the insert is taken as
+ the sign that the page was reorganized. */
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+ }
+
+ if (*rec) { /* inserted at the first attempt */
+ } else if (block->page.zip.data) {
+ ut_ad(!index->table->is_temporary());
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing. */
+ if (leaf
+ && !dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(block);
+ }
+
+ goto fail;
+ } else {
+ ut_ad(!reorg);
+ reorg = true;
+
+ /* If the record did not fit, reorganize. Any failure of
+ the reorganize, a changed maximum insert size, or a failed
+ second insert is reported as DB_CORRUPTION. */
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS
+ || page_get_max_insert_size(page, 1) != max_size
+ || !(*rec = page_cur_tuple_insert(page_cursor, entry,
+ offsets, heap, n_ext,
+ mtr))) {
+ err = DB_CORRUPTION;
+ goto fail_err;
+ }
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (!leaf) { /* nothing to update for a non-leaf page */
+ } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(flags == BTR_NO_LOCKING_FLAG);
+ } else if (index->table->is_temporary()) {
+ } else {
+ srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index);
+ if (!reorg && cursor->flag == BTR_CUR_HASH) {
+ btr_search_update_hash_node_on_insert(
+ cursor, ahi_latch);
+ } else {
+ btr_search_update_hash_on_insert(cursor, ahi_latch);
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
+
+ lock_update_insert(block, *rec);
+ }
+
+ if (leaf
+ && !dict_index_is_clust(index)
+ && !index->table->is_temporary()) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (block->page.zip.data) {
+ /* Update the bits in the same mini-transaction. */
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ /* Decrement the bits in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(
+ block, max_size,
+ rec_size + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+
+Steps, in order:
+1. cursor->flag is set to BTR_CUR_BINARY.
+2. btr_cur_ins_lock_and_undo() is called; any result other than
+DB_SUCCESS is returned as is.
+3. Unless index->is_ibuf() holds, cursor->tree_height / 16 + 3 free
+extents are reserved; they are released again before returning.
+4. If the record needs external storage, entry is converted with
+dtuple_convert_big_rec(); DB_TOO_BIG_RECORD is returned if that fails.
+5. The record is inserted with btr_root_raise_and_insert() if the cursor
+page is the root page of the index, else with btr_page_split_and_insert().
+6. After a successful insert, the page max trx id (secondary indexes),
+the adaptive hash index and the record locks are updated as applicable.
+
+NOTE(review): when the insert in step 5 fails, *big_rec is still set to
+the converted vector and entry is not converted back here; presumably the
+caller is expected to deal with that -- verify against the callers.
+@return DB_SUCCESS or error number */
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ rec_offs** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in/out: query thread; can be NULL if
+ !(~flags
+ & (BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG)) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index = cursor->index();
+ big_rec_t* big_rec_vec = NULL;
+ bool inherit = false;
+ uint32_t n_reserved = 0;
+
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
+
+ *big_rec = NULL;
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+
+ cursor->flag = BTR_CUR_BINARY;
+
+ /* Check locks and write to undo log, if specified */
+
+ dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* First reserve enough free space for the file segments of
+ the index tree, so that the insert will not fail because of
+ lack of space */
+
+ if (!index->is_ibuf()
+ && (err = fsp_reserve_free_extents(&n_reserved, index->table->space,
+ uint32_t(cursor->tree_height / 16
+ + 3),
+ FSP_NORMAL, mtr))
+ != DB_SUCCESS) {
+ return err;
+ }
+
+ if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
+ index->table->not_redundant(),
+ dtuple_get_n_fields(entry),
+ btr_cur_get_block(cursor)->zip_size())
+ || UNIV_UNLIKELY(entry->is_alter_metadata()
+ && !dfield_is_ext(
+ dtuple_get_nth_field(
+ entry,
+ index->first_user_field())))) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ if (UNIV_LIKELY_NULL(big_rec_vec)) {
+ /* This should never happen, but we handle
+ the situation in a robust manner. */
+ ut_ad(0);
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
+
+ if (big_rec_vec == NULL) {
+
+ index->table->space->release_free_extents(n_reserved);
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
+ *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no()
+ ? btr_root_raise_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err)
+ : btr_page_split_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err);
+
+ if (!*rec) { /* err was set by the insert function called above */
+ goto func_exit;
+ }
+
+ ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
+ || dict_index_is_spatial(index));
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ ut_ad(!index->table->is_temporary());
+ if (dict_index_is_spatial(index)) {
+ /* Do nothing */
+ } else {
+ /* The cursor might be moved to the other page
+ and the max trx id field should be updated after
+ the cursor was fixed. */
+ if (!dict_index_is_clust(index)) {
+ page_update_max_trx_id(
+ btr_cur_get_block(cursor),
+ btr_cur_get_page_zip(cursor),
+ thr_get_trx(thr)->id, mtr);
+ }
+
+ if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
+ || !page_has_prev(btr_cur_get_page(cursor))) {
+ /* split and inserted need to call
+ lock_update_insert() always. */
+ inherit = true;
+ }
+ }
+ }
+
+ if (!page_is_leaf(btr_cur_get_page(cursor))) {
+ ut_ad(!big_rec_vec);
+ } else {
+#ifdef BTR_CUR_HASH_ADAPT
+ if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
+ ut_ad(entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(!(flags & BTR_CREATE_FLAG));
+ } else if (index->table->is_temporary()) {
+ } else {
+ btr_search_update_hash_on_insert(
+ cursor, btr_search_sys.get_latch(*index));
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+ if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
+
+ lock_update_insert(btr_cur_get_block(cursor), *rec);
+ }
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ index->table->space->release_free_extents(n_reserved);
+ *big_rec = big_rec_vec;
+
+ return err;
+}
+
+/*==================== B-TREE UPDATE =========================*/
+
+/*************************************************************//**
+Checks the locks and writes the undo log record for an update.
+For a record that is not in the clustered index, only the lock check
+is performed, because undo logging is done only for clustered index
+records.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+btr_cur_upd_lock_and_undo(
+/*======================*/
+  ulint flags, /*!< in: undo logging and locking flags */
+  btr_cur_t* cursor, /*!< in: cursor on record to update */
+  const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */
+  const upd_t* update, /*!< in: update vector */
+  ulint cmpl_info,/*!< in: compiler info on secondary index
+                  updates */
+  que_thr_t* thr, /*!< in: query thread
+                  (can be NULL if BTR_NO_LOCKING_FLAG) */
+  mtr_t* mtr, /*!< in/out: mini-transaction */
+  roll_ptr_t* roll_ptr)/*!< out: roll pointer */
+{
+  ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
+
+  const rec_t* const cur_rec = btr_cur_get_rec(cursor);
+  dict_index_t* const cur_index = cursor->index();
+
+  ut_ad(rec_offs_validate(cur_rec, cur_index, offsets));
+  ut_ad(mtr->is_named_space(cur_index->table->space));
+
+  if (!dict_index_is_clust(cur_index)) {
+    ut_ad(dict_index_is_online_ddl(cur_index)
+          == !!(flags & BTR_CREATE_FLAG));
+
+    /* Secondary index record: lock check only, no undo logging */
+    return lock_sec_rec_modify_check_and_lock(
+      flags, btr_cur_get_block(cursor), cur_rec,
+      cur_index, thr, mtr);
+  }
+
+  /* Clustered index record: enqueue an explicit lock request
+  if we have to wait for a lock */
+  if (!(flags & BTR_NO_LOCKING_FLAG)) {
+    const dberr_t lock_err = lock_clust_rec_modify_check_and_lock(
+      btr_cur_get_block(cursor), cur_rec, cur_index,
+      offsets, thr);
+
+    if (lock_err != DB_SUCCESS) {
+      return lock_err;
+    }
+  }
+
+  if (flags & BTR_NO_UNDO_LOG_FLAG) {
+    return DB_SUCCESS;
+  }
+
+  /* Append the info about the update in the undo log */
+  return trx_undo_report_row_operation(
+    thr, cur_index, NULL, update,
+    cmpl_info, cur_rec, offsets, roll_ptr);
+}
+
+/** Store DB_TRX_ID and DB_ROLL_PTR in the corresponding fields of a
+clustered index entry.
+@param[in,out] entry clustered index entry
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR */
+static void btr_cur_write_sys(
+  dtuple_t* entry,
+  const dict_index_t* index,
+  trx_id_t trx_id,
+  roll_ptr_t roll_ptr)
+{
+  /* DB_TRX_ID */
+  dfield_t* const trx_id_field = dtuple_get_nth_field(
+    entry, index->db_trx_id());
+  ut_ad(trx_id_field->len == DATA_TRX_ID_LEN);
+  byte* const trx_id_buf = static_cast<byte*>(trx_id_field->data);
+  trx_write_trx_id(trx_id_buf, trx_id);
+
+  /* DB_ROLL_PTR */
+  dfield_t* const roll_ptr_field = dtuple_get_nth_field(
+    entry, index->db_roll_ptr());
+  ut_ad(roll_ptr_field->len == DATA_ROLL_PTR_LEN);
+  byte* const roll_ptr_buf = static_cast<byte*>(roll_ptr_field->data);
+  trx_write_roll_ptr(roll_ptr_buf, roll_ptr);
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
+
+For a compressed page (block->page.zip.data is set) the work is delegated
+to page_zip_write_trx_id_and_roll_ptr(). Otherwise the new
+DB_TRX_ID,DB_ROLL_PTR bytes are written at rec + offset through mtr.
+When index->trx_id_offset is nonzero and more than 6 leading bytes of the
+new value equal the bytes at the same offset in the preceding record
+(which must not be the infimum) while differing from the current contents,
+that prefix is copied from the preceding record and logged with
+mtr->memmove(); only the remaining bytes are logged as a write.
+@param[in,out] block clustered index leaf page
+@param[in,out] rec clustered index record
+@param[in] index clustered index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx transaction
+@param[in] roll_ptr DB_ROLL_PTR value
+@param[in,out] mtr mini-transaction
+@return error code
+@retval DB_CORRUPTION if page_rec_get_prev_const(rec) returns nullptr
+or rec itself */
+static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const rec_offs *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+ trx->id, roll_ptr, mtr);
+ return DB_SUCCESS;
+ }
+
+ ulint offset= index->trx_id_offset;
+
+ if (!offset) /* no fixed offset stored in the index: compute it from offsets */
+ offset= row_get_trx_id_offset(index, offsets);
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+ /* During IMPORT the trx id in the record can be in the future, if
+ the .ibd file is being imported from another instance. During IMPORT
+ roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0 ||
+ lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ /* The new DB_TRX_ID,DB_ROLL_PTR value, assembled in a local buffer */
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ trx_write_trx_id(sys, trx->id);
+ trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+ ulint d= 0; /* number of leading bytes shared with the preceding record */
+ const byte *src= nullptr;
+ byte *dest= rec + offset;
+ ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ if (UNIV_LIKELY(index->trx_id_offset))
+ {
+ const rec_t *prev= page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(!prev || prev == rec))
+ return DB_CORRUPTION;
+ else if (page_rec_is_infimum(prev));
+ else
+ for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+ if (src[d] != sys[d])
+ break;
+ if (d > 6 && memcmp(dest, sys, d))
+ {
+ /* We save space by replacing a single record
+
+ WRITE,page_offset(dest),byte[13]
+
+ with two records:
+
+ MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+ WRITE|0x80,0,byte[13-d]
+
+ The single WRITE record would be x+13 bytes long, with x>2.
+ The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+ second WRITE would be 1+1+13-d = 15-d bytes.
+
+ The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+ To save space, we must have d>6, that is, the complete DB_TRX_ID and
+ the first byte(s) of DB_ROLL_PTR must match the previous record. */
+ memcpy(dest, src, d);
+ mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+ dest+= d;
+ len-= d;
+ /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+ DB_TRX_ID refers to an active transaction. */
+ ut_ad(len);
+ }
+ else
+ d= 0; /* no usable shared prefix: write all of sys below */
+ }
+
+ if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
+
+ return DB_SUCCESS;
+}
+
+/*************************************************************//**
+See if there is enough place in the page modification log to log
+an update-in-place.
+
+If page_zip_available() fails at first, the page is reorganized with
+btr_page_reorganize() and the check is repeated, unless the page is
+freshly compressed without garbage, or (for create on a leaf page)
+length plus the page data size reaches
+dict_index_zip_pad_optimal_page_size(index).
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if enough place;
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+#ifdef UNIV_DEBUG
+ rec_offs* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index = cursor->index;
+
+ /* Have a local copy of the variables as these can change
+ dynamically. */
+ const page_t* page = page_cur_get_page(cursor);
+
+ ut_ad(page_zip == page_cur_get_page_zip(cursor));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(true);
+ }
+
+ if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+ /* The page has been freshly compressed, so
+ reorganizing it will not help. */
+ return(false);
+ }
+
+ if (create && page_is_leaf(page)
+ && (length + page_get_data_size(page)
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+ return(false);
+ }
+
+ if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) {
+ rec_offs_make_valid(page_cur_get_rec(cursor), index,
+ page_is_leaf(page), offsets);
+
+ /* After recompressing a page, we must make sure that the free
+ bits in the insert buffer bitmap will not exceed the free
+ space on the page. Because this function will not attempt
+ recompression unless page_zip_available() fails above, it is
+ safe to reset the free bits if page_zip_available() fails
+ again, below. The free bits can safely be reset in a separate
+ mini-transaction. If page_zip_available() succeeds below, we
+ can be sure that the btr_page_reorganize() above did not reduce
+ the free space available on the page. */
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return true;
+ }
+ }
+
+ /* Out of space: reset the free bits for a leaf page of a
+ secondary index of a non-temporary table. */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(page)) {
+ ibuf_reset_free_bits(page_cur_get_block(cursor));
+ }
+
+ return(false);
+}
+
+/** Apply an update vector to a record. No field size changes are allowed.
+
+This is usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page().
+
+The info bits of the record are replaced by update->info_bits first.
+On an uncompressed page every change is written through mtr and the
+function returns after the field loop. On a compressed page
+(block->page.zip.data is set) the fields are copied directly into the
+record, and afterwards the change is written with
+page_zip_write_trx_id_and_roll_ptr() when only DB_ROLL_PTR, or DB_TRX_ID
+and DB_ROLL_PTR, of a clustered index were updated, or with
+page_zip_write_rec() otherwise; with an empty update vector only the
+delete-mark is written, and only if it changed.
+@param[in,out] rec index record
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] update update vector
+@param[in,out] block index page
+@param[in,out] mtr mini-transaction */
+void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
+ const rec_offs *offsets, const upd_t *update,
+ buf_block_t *block, mtr_t *mtr)
+{
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!index->table->skip_alter_undo);
+ ut_ad(!block->page.zip.data || index->table->not_redundant());
+
+#ifdef UNIV_DEBUG
+ if (rec_offs_comp(offsets)) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant());
+ break;
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong record status in update" == 0);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(rec_offs_comp(offsets));
+ byte* info_bits = &rec[-REC_NEW_INFO_BITS];
+ const bool flip_del_mark = (*info_bits ^ update->info_bits)
+ & REC_INFO_DELETED_FLAG;
+ *info_bits &= byte(~REC_INFO_BITS_MASK);
+ *info_bits |= update->info_bits;
+
+ if (flip_del_mark) {
+ page_zip_rec_set_deleted(block, rec, update->info_bits
+ & REC_INFO_DELETED_FLAG, mtr);
+ }
+ } else {
+ byte* info_bits = &rec[rec_offs_comp(offsets)
+ ? -REC_NEW_INFO_BITS
+ : -REC_OLD_INFO_BITS];
+
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
+ (*info_bits
+ & ~REC_INFO_BITS_MASK)
+ | update->info_bits);
+ }
+
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
+ continue;
+ }
+ const ulint n = uf->field_no;
+
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { /* the new value is SQL NULL */
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+
+ ut_ad(!index->table->not_redundant());
+ switch (ulint size = rec_get_nth_field_size(rec, n)) {
+ case 0:
+ break;
+ case 1:
+ mtr->write<1,mtr_t::MAYBE_NOP>(
+ *block,
+ rec_get_field_start_offs(rec, n) + rec,
+ 0U);
+ break;
+ default:
+ mtr->memset(
+ block,
+ page_offset(rec_get_field_start_offs(
+ rec, n) + rec),
+ size, 0);
+ }
+ ulint l = rec_get_1byte_offs_flag(rec)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ mtr->write<1>(*block, b,
+ byte(*b | REC_1BYTE_SQL_NULL_MASK));
+ continue;
+ }
+
+ ulint len;
+ byte* data = rec_get_nth_field(rec, offsets, n, &len);
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(len == uf->new_val.len);
+ memcpy(data, uf->new_val.data, len);
+ continue;
+ }
+
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) { /* the old value was SQL NULL (asserted below) */
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ len = uf->new_val.len;
+ ut_ad(len == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(rec)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ mtr->write<1>(*block, b,
+ byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
+ }
+
+ if (len) {
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
+ uf->new_val.data, len);
+ }
+ }
+
+ if (UNIV_LIKELY(!block->page.zip.data)) {
+ return;
+ }
+
+ switch (update->n_fields) {
+ case 0:
+ /* We only changed the delete-mark flag. */
+ return;
+ case 1:
+ if (!index->is_clust()
+ || update->fields[0].field_no != index->db_roll_ptr()) {
+ break;
+ }
+ goto update_sys;
+ case 2:
+ if (!index->is_clust()
+ || update->fields[0].field_no != index->db_trx_id()
+ || update->fields[1].field_no != index->db_roll_ptr()) {
+ break;
+ }
+ update_sys:
+ ulint len;
+ const byte* sys = rec_get_nth_field(rec, offsets,
+ index->db_trx_id(), &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ page_zip_write_trx_id_and_roll_ptr(
+ block, rec, offsets, index->db_trx_id(),
+ trx_read_trx_id(sys),
+ trx_read_roll_ptr(sys + DATA_TRX_ID_LEN), mtr);
+ return;
+ }
+
+ page_zip_write_rec(block, rec, index, offsets, 0, mtr);
+}
+
+/** Decide whether a record on a ROW_FORMAT=COMPRESSED page may be
+updated in place, reserving space in the compressed page when needed.
+
+No space check is made when the update vector is empty, or when the
+index is clustered and the vector consists of exactly DB_ROLL_PTR, or of
+DB_TRX_ID followed by DB_ROLL_PTR. For every other update vector,
+btr_cur_update_alloc_zip() is invoked with the current record size.
+@param cur      cursor pointing to ROW_FORMAT=COMPRESSED page
+@param offsets  rec_get_offsets(btr_cur_get_rec(cur))
+@param update   index fields being updated
+@param mtr      mini-transaction
+@return the record in the ROW_FORMAT=COMPRESSED page
+@retval nullptr if the page cannot be updated in place */
+ATTRIBUTE_COLD static
+rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets,
+                                         const upd_t& update, mtr_t *mtr)
+{
+  dict_index_t *const idx= cur->index();
+  ut_ad(!idx->table->is_temporary());
+
+  /* Set when only the delete-mark flag and possibly the system columns
+  DB_TRX_ID, DB_ROLL_PTR change. Those can be updated in place in the
+  uncompressed part of the ROW_FORMAT=COMPRESSED page. */
+  bool sys_only;
+
+  if (update.n_fields == 0)
+    sys_only= true;
+  else if (update.n_fields == 1)
+    sys_only= idx->is_clust() &&
+      update.fields[0].field_no == idx->db_roll_ptr();
+  else if (update.n_fields == 2)
+    sys_only= idx->is_clust() &&
+      update.fields[0].field_no == idx->db_trx_id() &&
+      update.fields[1].field_no == idx->db_roll_ptr();
+  else
+    sys_only= false;
+
+  if (!sys_only &&
+      !btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur),
+                                btr_cur_get_page_cur(cur),
+                                offsets, rec_offs_size(offsets),
+                                false, mtr))
+    return nullptr;
+
+  return btr_cur_get_rec(cur);
+}
+
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change.
+
+The steps, in order:
+(1) on a ROW_FORMAT=COMPRESSED page, btr_cur_update_in_place_zip_check()
+must return a record, or DB_ZIP_OVERFLOW is returned immediately;
+(2) btr_cur_upd_lock_and_undo() is invoked;
+(3) unless BTR_KEEP_SYS_FLAG is set, btr_cur_upd_rec_sys() is invoked;
+(4) btr_cur_upd_rec_in_place() applies the update vector;
+(5) if the record was delete-marked before and is not afterwards,
+btr_cur_unmark_extern_fields() is invoked.
+Every exit other than the DB_ZIP_OVERFLOW return passes through
+func_exit, which may invoke ibuf_update_free_bits_zip().
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ dberr_t err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr = 0; /* passed by address to btr_cur_upd_lock_and_undo() */
+ ulint was_delete_marked; /* delete-mark flag of rec before the update */
+
+ ut_ad(page_is_leaf(cursor->page_cur.block->page.frame));
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index();
+ ut_ad(!index->is_ibuf());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
+ ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
+ ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));
+
+ DBUG_LOG("ib_cur",
+ "update-in-place " << index->name << " (" << index->id
+ << ") by " << ib::hex(trx_id) << ": "
+ << rec_printer(rec, offsets).str());
+
+ buf_block_t* block = btr_cur_get_block(cursor);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ /* Check that enough space is available on the compressed page.
+ On success, rec is reassigned to the record returned by the check.
+ On failure we return directly, without passing through func_exit. */
+ if (UNIV_LIKELY_NULL(page_zip)
+ && !(rec = btr_cur_update_in_place_zip_check(
+ cursor, offsets, *update, mtr))) {
+ return DB_ZIP_OVERFLOW;
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ err = btr_cur_upd_rec_sys(block, rec, index, offsets,
+ thr_get_trx(thr), roll_ptr, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto func_exit;
+ }
+ }
+
+ was_delete_marked = rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)));
+ /* In delete-marked records, DB_TRX_ID must always refer to an
+ existing undo log record. */
+ ut_ad(!was_delete_marked
+ || !dict_index_is_clust(index)
+ || row_get_rec_trx_id(rec, index, offsets));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ {
+ srw_spin_lock* ahi_latch = block->index
+ ? btr_search_sys.get_latch(*index) : NULL;
+ if (ahi_latch) {
+ /* TO DO: Can we skip this if none of the fields
+ index->search_info->curr_n_fields
+ are being updated? */
+
+ /* The function row_upd_changes_ord_field_binary
+ does not work on a secondary index. */
+
+ if (!dict_index_is_clust(index)
+ || row_upd_changes_ord_field_binary(
+ index, update, thr, NULL, NULL)) {
+ ut_ad(!(update->info_bits
+ & REC_INFO_MIN_REC_FLAG));
+ /* Remove possible hash index pointer
+ to this record */
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ ahi_latch->wr_lock(SRW_LOCK_CALL); /* released after btr_cur_upd_rec_in_place() below */
+ }
+
+ assert_block_ahi_valid(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
+ mtr);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_latch) {
+ ahi_latch->wr_unlock();
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (was_delete_marked
+ && !rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)))) {
+ /* The new updated record owns its possible externally
+ stored fields */
+
+ btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
+ }
+
+ ut_ad(err == DB_SUCCESS);
+
+func_exit:
+ if (page_zip
+ && !(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)
+ && page_is_leaf(buf_block_get_frame(block))) {
+ /* Update the free bits in the insert buffer.
+ This is reached on success and on the error paths above;
+ it applies only to leaf pages of secondary indexes in
+ ROW_FORMAT=COMPRESSED, and only when the caller did not
+ pass BTR_KEEP_IBUF_BITMAP. */
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ return(err);
+}
+
+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+
+The new field count is taken from update->fields[1].field_no. If that
+differs from index->n_uniq, it is assigned to entry->n_fields as is.
+Otherwise, the count is read from the BLOB page that the external field
+reference in update->fields[0] points to: the 4-byte value stored right
+after the BLOB header, plus index->first_user_field(), plus 1.
+If the BLOB page cannot be obtained, entry is left unchanged.
+@param[in,out] entry metadata tuple; entry->n_fields may be reduced
+@param[in] index primary key
+@param[in] update update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update)
+{
+ ut_ad(index->is_instant());
+ ut_ad(update->is_alter_metadata());
+ ut_ad(entry->is_alter_metadata());
+
+ ut_ad(update->fields[0].field_no == index->first_user_field());
+ ut_ad(update->fields[0].new_val.ext);
+ ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+ ut_ad(entry->n_fields - 1 == index->n_fields);
+
+ const byte* ptr = static_cast<const byte*>(
+ update->fields[0].new_val.data);
+ ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ == index->table->space->id);
+
+ ulint n_fields = update->fields[1].field_no;
+ ut_ad(n_fields <= index->n_fields);
+ if (n_fields != index->n_uniq) {
+ ut_ad(n_fields
+ >= index->n_core_fields);
+ entry->n_fields = n_fields;
+ return;
+ }
+
+ /* This is based on dict_table_t::deserialise_columns()
+ and btr_cur_instant_init_low().
+ A separate local mini-transaction is used for reading the BLOB
+ page; it is committed before entry is modified. */
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* block = buf_page_get(
+ page_id_t(index->table->space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ 0, RW_S_LATCH, &mtr);
+ if (!block) {
+ ut_ad("corruption" == 0);
+ mtr.commit();
+ return;
+ }
+ ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB);
+ ut_ad(mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ == FIL_NULL);
+ ut_ad(mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN])
+ == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+ n_fields = mach_read_from_4(
+ &block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+ + index->first_user_field();
+ /* Rollback should not increase the number of fields. */
+ ut_ad(n_fields <= index->n_fields);
+ ut_ad(n_fields + 1 <= entry->n_fields);
+ /* dict_index_t::clear_instant_alter() cannot be invoked while
+ rollback of an instant ALTER TABLE transaction is in progress
+ for an is_alter_metadata() record. */
+ ut_ad(n_fields >= index->n_core_fields);
+
+ mtr.commit();
+ entry->n_fields = n_fields + 1; /* one more than the count computed from the BLOB page */
+}
+
+/** Shorten an index entry as required by instant ALTER TABLE.
+
+Nothing is done unless index->is_instant() holds.
+
+For an ordinary (non-metadata) update, entry->trim(*index) is invoked,
+which omits trailing instantly added fields that match the initial
+default values.
+
+For an update of the special metadata record, the entry is only
+shortened when the transaction is in rollback: either by
+btr_cur_trim_alter_metadata() if the update is for ALTER metadata, or
+else to the field number of the first field in the update vector.
+
+@param[in,out]	entry	index entry
+@param[in]	index	index
+@param[in]	update	update vector
+@param[in]	thr	execution thread */
+static inline
+void
+btr_cur_trim(
+	dtuple_t*		entry,
+	const dict_index_t*	index,
+	const upd_t*		update,
+	const que_thr_t*	thr)
+{
+	if (!index->is_instant()) {
+		/* No instant ALTER TABLE: nothing to trim. */
+		return;
+	}
+
+	if (UNIV_LIKELY(!update->is_metadata())) {
+		/* Normal record */
+		entry->trim(*index);
+		return;
+	}
+
+	/* We are either updating a metadata record
+	(instant ALTER TABLE on a table where instant ALTER was
+	already executed) or rolling back such an operation. */
+	ut_ad(!upd_get_nth_field(update, 0)->orig_len);
+	ut_ad(entry->is_metadata());
+
+	if (!thr->graph->trx->in_rollback) {
+		return;
+	}
+
+	/* This rollback can occur either as part of
+	ha_innobase::commit_inplace_alter_table() rolling
+	back after a failed innobase_add_instant_try(),
+	or as part of crash recovery. Either way, the
+	table will be in the data dictionary cache, with
+	the instantly added columns going to be removed
+	later in the rollback. */
+	ut_ad(index->table->cached);
+	/* The DB_TRX_ID,DB_ROLL_PTR are always last,
+	and there should be some change to roll back.
+	The first field in the update vector is the
+	first instantly added column logged by
+	innobase_add_instant_try(). */
+	ut_ad(update->n_fields > 2);
+
+	if (update->is_alter_metadata()) {
+		btr_cur_trim_alter_metadata(entry, index, update);
+		return;
+	}
+
+	ut_ad(!entry->is_alter_metadata());
+
+	const ulint	first_added = upd_get_nth_field(update, 0)->field_no;
+	ut_ad(first_added + 1 >= entry->n_fields);
+	entry->n_fields = first_added;
+}
+
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. We assume here that the ordering
+fields of the record do not change.
+
+If the update is not a metadata update and does not change the size or
+external storage of any field, the work is delegated to
+btr_cur_update_in_place(). Otherwise the record is converted to an index
+entry, the update vector is applied to the entry, and the record is
+replaced by deleting it and inserting the entry at the same position.
+DB_OVERFLOW is returned without modifying the page when the record or the
+update vector contains externally stored fields, or when the record is a
+metadata record and index->table->instant is set.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ rec_offs** offsets,/*!< in/out: offsets on cursor->page_cur.rec;
+ the incoming *offsets is passed to
+ rec_get_offsets() and then replaced */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ ulint max_size;
+ ulint new_rec_size;
+ ulint old_rec_size;
+ ulint max_ins_size = 0; /* only computed for uncompressed pages */
+ dtuple_t* new_entry;
+ roll_ptr_t roll_ptr;
+ ulint i;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index();
+ ut_ad(index->has_locking());
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ /* This is intended only for leaf page updates */
+ ut_ad(page_is_leaf(page));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(btr_page_get_index_id(page) == index->id);
+
+ *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
+ ULINT_UNDEFINED, heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, *offsets)
+ || thr_get_trx(thr) == trx_roll_crash_recv_trx);
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (UNIV_LIKELY(!update->is_metadata())
+ && !row_upd_changes_field_size_or_external(index, *offsets,
+ update)) {
+
+ /* The simplest and the most common case: the update does not
+ change the size of any field and none of the updated fields is
+ externally stored in rec or update, and there is enough space
+ on the compressed page to log the update. */
+
+ return(btr_cur_update_in_place(
+ flags, cursor, *offsets, update,
+ cmpl_info, thr, trx_id, mtr));
+ }
+
+ if (rec_offs_any_extern(*offsets)) {
+any_extern:
+ ut_ad(!index->is_ibuf());
+ /* Externally stored fields are treated in pessimistic
+ update. This label is also jumped to for a metadata record
+ when index->table->instant is set, for an update vector that
+ contains an externally stored new value, and for a
+ compressed page when page_zip_rec_needs_ext() holds for the
+ new record size. */
+
+ /* prefetch siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, index);
+
+ return(DB_OVERFLOW);
+ }
+
+ if (rec_is_metadata(rec, *index) && index->table->instant) {
+ goto any_extern;
+ }
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
+
+ goto any_extern;
+ }
+ }
+
+ DBUG_LOG("ib_cur",
+ "update " << index->name << " (" << index->id << ") by "
+ << ib::hex(trx_id) << ": "
+ << rec_printer(rec, *offsets).str());
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ if (!*heap) {
+ *heap = mem_heap_create(
+ rec_offs_size(*offsets)
+ + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
+ }
+
+ new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap);
+ ut_ad(!dtuple_get_n_ext(new_entry));
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr.
+ Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ *heap);
+ btr_cur_trim(new_entry, index, update, thr);
+ old_rec_size = rec_offs_size(*offsets);
+ new_rec_size = rec_get_converted_size(index, new_entry, 0);
+
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+
+ if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
+ dict_index_get_n_fields(index),
+ block->zip_size())) {
+ goto any_extern;
+ }
+
+ if (!btr_cur_update_alloc_zip(
+ page_zip, page_cursor, *offsets,
+ new_rec_size, true, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ rec = page_cur_get_rec(page_cursor); /* re-read rec from the cursor after the call above */
+ }
+
+ /* We limit max record size to 16k even for 64k page size. */
+ if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
+ (!dict_table_is_comp(index->table)
+ && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(new_rec_size
+ >= (page_get_free_space_of_empty(page_is_comp(page))
+ / 2))) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_get_data_size(page)
+ - old_rec_size + new_rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* The page would become too empty */
+ err = DB_UNDERFLOW;
+ goto func_exit;
+ }
+
+ /* We do not attempt to reorganize if the page is compressed.
+ This is because the page may fail to compress after reorganization.
+ For an uncompressed page, the space of the old record is counted
+ as available, because the old record will be deleted first. */
+ max_size = page_zip
+ ? page_get_max_insert_size(page, 1)
+ : (old_rec_size
+ + page_get_max_insert_size_after_reorganize(page, 1));
+
+ if (!page_zip) {
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+
+ if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
+ && (max_size >= new_rec_size))
+ || (page_get_n_recs(page) <= 1))) {
+
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* There was not enough space, or it did not pay to
+ reorganize: for simplicity, we decide what to do assuming a
+ reorganization is needed, though it might not be necessary */
+
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ /* Ok, we may do the replacement. Store on the page infimum the
+ explicit locks on rec, before deleting rec (see the comment in
+ btr_cur_pessimistic_update). */
+ if (index->has_locking()) {
+ lock_rec_store_on_page_infimum(block, rec);
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(new_entry->is_metadata());
+ ut_ad(index->is_instant());
+ /* This can be innobase_add_instant_try() performing a
+ subsequent instant ADD COLUMN, or its rollback by
+ row_undo_mod_clust_low(). */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ } else {
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ page_cur_delete_rec(page_cursor, *offsets, mtr);
+
+ if (!page_cur_move_to_prev(page_cursor)) {
+ return DB_CORRUPTION; /* NOTE: returns directly, without passing through func_exit */
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
+ }
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap,
+ 0/*n_ext*/, mtr);
+ if (UNIV_UNLIKELY(!rec)) {
+ goto corrupted;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ } else {
+ /* Restore the old explicit lock state on the record */
+ lock_rec_restore_from_page_infimum(*block, rec,
+ block->page.id());
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ if (!page_cur_move_to_next(page_cursor)) {
+corrupted:
+ err = DB_CORRUPTION;
+ }
+
+func_exit:
+ if (!(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)) {
+ /* Update the free bits in the insert buffer.
+ For an uncompressed page, max_ins_size is the value that was
+ computed before the replacement, or 0 if we jumped here
+ before it was computed. */
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ } else if (!index->table->is_temporary()) {
+ ibuf_update_free_bits_low(block, max_ins_size, mtr);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ /* prefetch siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, index);
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Fixes up the locks of the supremum record of the preceding page after a
+pessimistic update.
+
+If, in a split, a new supremum record was created as the predecessor of
+the updated record, that supremum must inherit exactly the locks on the
+updated record. During the split it may instead have inherited locks from
+the successor of the updated record, which is not correct.
+
+Nothing is done unless the updated record is the first user record on
+its page. Otherwise the preceding page is looked up in the buffer pool
+and lock_rec_reset_and_inherit_gap_locks() is invoked for its supremum.
+@return DB_SUCCESS, or the error reported by buf_page_get_gen() when
+the preceding page is not available */
+static
+dberr_t
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*	rec,	/*!< in: updated record */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t* const	frame = buf_block_get_frame(block);
+
+	const bool	is_first_user_rec
+		= page_rec_get_next(page_get_infimum_rec(frame)) == rec;
+
+	if (!is_first_user_rec) {
+		/* Only the first user record on a page directly
+		follows the supremum of the preceding page. */
+		return DB_SUCCESS;
+	}
+
+	const uint32_t	left_page_no = btr_page_get_prev(frame);
+	const page_id_t	cur_id{block->page.id()};
+	const page_id_t	left_id(cur_id.space(), left_page_no);
+
+	dberr_t		err;
+	buf_block_t*	left_block = buf_page_get_gen(
+		left_id, 0, RW_NO_LATCH, nullptr,
+		BUF_PEEK_IF_IN_POOL, mtr, &err);
+
+	/* Since we already held an x-latch on the preceding block, it
+	must be available and not be corrupted unless the buffer pool
+	got corrupted somehow. */
+	if (UNIV_UNLIKELY(!left_block)) {
+		return err;
+	}
+
+	/* The preceding page must link forward to this page. */
+	ut_ad(!memcmp_aligned<4>(left_block->page.frame + FIL_PAGE_NEXT,
+				 block->page.frame + FIL_PAGE_OFFSET, 4));
+
+	/* We must already have an x-latch on the preceding block! */
+	ut_ad(mtr->memo_contains_flagged(left_block, MTR_MEMO_PAGE_X_FIX));
+
+	lock_rec_reset_and_inherit_gap_locks(*left_block, cur_id,
+					     PAGE_HEAP_NO_SUPREMUM,
+					     page_rec_get_heap_no(rec));
+	return DB_SUCCESS;
+}
+
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change.
+@return DB_SUCCESS or error code */
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller */
+ upd_t* update, /*!< in/out: update vector; this is allowed to
+ also contain trx id and roll ptr fields.
+ Non-updated columns that are moved offpage will
+ be appended to this. */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be
+ committed before latching any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ dberr_t optim_err;
+ roll_ptr_t roll_ptr;
+ bool was_first;
+ uint32_t n_reserved = 0;
+
+ *offsets = NULL;
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page_zip = buf_block_get_page_zip(block);
+ index = cursor->index();
+ ut_ad(index->has_locking());
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
+ MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(!page_zip || !index->table->is_temporary());
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
+ || index->table->is_temporary());
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ulint(~BTR_KEEP_POS_FLAG))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+
+ err = optim_err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_IBUF_BITMAP,
+ cursor, offsets, offsets_heap, update,
+ cmpl_info, thr, trx_id, mtr);
+
+ switch (err) {
+ case DB_ZIP_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_OVERFLOW:
+ break;
+ default:
+ err_exit:
+ /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
+ For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
+ already reset by btr_cur_update_alloc_zip() if the
+ page was recompressed. */
+ if (page_zip
+ && optim_err != DB_ZIP_OVERFLOW
+ && !dict_index_is_clust(index)
+ && page_is_leaf(block->page.frame)) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ if (big_rec_vec != NULL) {
+ dtuple_big_rec_free(big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ rec = btr_cur_get_rec(cursor);
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ dtuple_t* new_entry;
+
+ const bool is_metadata = rec_is_metadata(rec, *index);
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(update->is_metadata());
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(index->is_instant());
+ new_entry = row_metadata_to_tuple(
+ rec, index, *offsets, entry_heap,
+ update->info_bits, !thr_get_trx(thr)->in_rollback);
+ ut_ad(new_entry->n_fields
+ == ulint(index->n_fields)
+ + update->is_alter_metadata());
+ } else {
+ new_entry = row_rec_to_index_entry(rec, index, *offsets,
+ entry_heap);
+ }
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr. If the
+ clustered index record is delete-marked, then its externally
+ stored fields cannot have been purged yet, because then the
+ purge would also have removed the clustered index record
+ itself. Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ entry_heap);
+ btr_cur_trim(new_entry, index, update, thr);
+
+ /* We have to set appropriate extern storage bits in the new
+ record to be inserted: we have to remember which fields were such */
+
+ ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec));
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ if ((flags & BTR_NO_UNDO_LOG_FLAG)
+ && rec_offs_any_extern(*offsets)) {
+ /* We are in a transaction rollback undoing a row
+ update: we must free possible externally stored fields
+ which got new values in the update, if they are not
+ inherited values. They can be inherited if we have
+ updated the primary key to another value, and then
+ update it back again. */
+
+ ut_ad(big_rec_vec == NULL);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->in_rollback);
+
+ DEBUG_SYNC_C("blob_rollback_middle");
+
+ btr_rec_free_updated_extern_fields(
+ index, rec, block, *offsets, update, true, mtr);
+ }
+
+ ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;
+
+ if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ page_is_comp(block->page.frame),
+ dict_index_get_n_fields(index),
+ block->zip_size())
+ || (UNIV_UNLIKELY(update->is_alter_metadata())
+ && !dfield_is_ext(dtuple_get_nth_field(
+ new_entry,
+ index->first_user_field())))) {
+ big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ /* We cannot goto return_after_reservations,
+ because we may need to update the
+ IBUF_BITMAP_FREE bits, which was suppressed by
+ BTR_KEEP_IBUF_BITMAP. */
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+ index->table->space->release_free_extents(n_reserved);
+ err = DB_TOO_BIG_RECORD;
+ goto err_exit;
+ }
+
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(dict_index_is_clust(index));
+ if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) {
+ ut_ad(page_zip != NULL);
+ dtuple_convert_back_big_rec(index, new_entry,
+ big_rec_vec);
+ big_rec_vec = NULL;
+ n_ext = dtuple_get_n_ext(new_entry);
+ }
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ if (optim_err == DB_OVERFLOW) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the update will not fail because
+ of lack of space */
+
+ err = fsp_reserve_free_extents(
+ &n_reserved, index->table->space,
+ uint32_t(cursor->tree_height / 16 + 3),
+ flags & BTR_NO_UNDO_LOG_FLAG
+ ? FSP_CLEANING : FSP_NORMAL,
+ mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto err_exit;
+ }
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
+ }
+
+ const ulint max_ins_size = page_zip
+ ? 0
+ : page_get_max_insert_size_after_reorganize(block->page.frame,
+ 1);
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(new_entry->is_metadata());
+ ut_ad(index->is_instant());
+ /* This can be innobase_add_instant_try() performing a
+ subsequent instant ALTER TABLE, or its rollback by
+ row_undo_mod_clust_low(). */
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ } else {
+ btr_search_update_hash_on_delete(cursor);
+
+ /* Store state of explicit locks on rec on the page
+ infimum record, before deleting rec. The page infimum
+ acts as a dummy carrier of the locks, taking care also
+ of lock releases, before we can move the locks back on
+ the actual record. There is a special case: if we are
+ inserting on the root page and the insert causes a
+ call of btr_root_raise_and_insert. Therefore we cannot
+ in the lock system delete the lock structs set on the
+ root page even if the root page carries just node
+ pointers. */
+ lock_rec_store_on_page_infimum(block, rec);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_delete_rec(page_cursor, *offsets, mtr);
+
+ if (!page_cur_move_to_prev(page_cursor)) {
+ err = DB_CORRUPTION;
+ goto return_after_reservations;
+ }
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry,
+ offsets, offsets_heap, n_ext, mtr);
+
+ if (rec) {
+ page_cursor->rec = rec;
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto return_after_reservations;
+ }
+ rec = page_cursor->rec;
+ rec_offs_make_valid(rec, index, true, *offsets);
+ if (page_cursor->block->page.id().page_no()
+ == index->page) {
+ btr_set_instant(page_cursor->block, *index,
+ mtr);
+ }
+ } else {
+ lock_rec_restore_from_page_infimum(
+ *btr_cur_get_block(cursor), rec,
+ block->page.id());
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
+ || rec_is_alter_metadata(rec, *index)) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
+ rec, index, *offsets, mtr);
+ } else {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ }
+
+ bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
+ ut_ad(!adjust || page_is_leaf(block->page.frame));
+
+ if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
+ if (adjust) {
+ rec_offs_make_valid(page_cursor->rec, index,
+ true, *offsets);
+ }
+ } else if (!dict_index_is_clust(index)
+ && page_is_leaf(block->page.frame)) {
+ /* Update the free bits in the insert buffer.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ ibuf_update_free_bits_zip(block, mtr);
+ } else if (!index->table->is_temporary()) {
+ ibuf_update_free_bits_low(block, max_ins_size,
+ mtr);
+ }
+ }
+
+#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
+ if (!big_rec_vec
+ && page_is_leaf(block->page.frame)
+ && !dict_index_is_online_ddl(index)) {
+ mtr->release(index->lock);
+ /* NOTE: We cannot release root block latch here, because it
+ has segment header and already modified in most of cases.*/
+ }
+#endif
+
+ err = DB_SUCCESS;
+ goto return_after_reservations;
+ } else {
+ /* If the page is compressed and it initially
+ compresses very well, and there is a subsequent insert
+ of a badly-compressing record, it is possible for
+ btr_cur_optimistic_update() to return DB_UNDERFLOW and
+ btr_cur_insert_if_possible() to return FALSE. */
+ ut_a(page_zip || optim_err != DB_UNDERFLOW);
+
+ /* Out of space: reset the free bits.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ if (!dict_index_is_clust(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(block->page.frame)) {
+ ibuf_reset_free_bits(block);
+ }
+ }
+
+ if (big_rec_vec != NULL) {
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+
+ /* btr_page_split_and_insert() in
+ btr_cur_pessimistic_insert() invokes
+ mtr->release(index->lock).
+ We must keep the index->lock when we created a
+ big_rec, so that row_upd_clust_rec() can store the
+ big_rec in the same mini-transaction. */
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ mtr_sx_lock_index(index, mtr);
+ }
+
+ /* Was the record to be updated positioned as the first user
+ record on its page? */
+ was_first = page_cur_is_before_first(page_cursor);
+
+ /* Lock checks and undo logging were already performed by
+ btr_cur_upd_lock_and_undo(). We do not try
+ btr_cur_optimistic_insert() because
+ btr_cur_insert_if_possible() already failed above. */
+
+ err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ cursor, offsets, offsets_heap,
+ new_entry, &rec,
+ &dummy_big_rec, n_ext, NULL, mtr);
+ ut_a(err == DB_SUCCESS);
+ ut_a(rec);
+ ut_a(dummy_big_rec == NULL);
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
+ page_cursor->rec = rec;
+
+ /* Multiple transactions cannot simultaneously operate on the
+ same temp-table in parallel.
+ max_trx_id is ignored for temp tables because it is not required
+ for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()) {
+ /* Update PAGE_MAX_TRX_ID in the index page header.
+ It was not updated by btr_cur_pessimistic_insert()
+ because of BTR_NO_LOCKING_FLAG. */
+ page_update_max_trx_id(btr_cur_get_block(cursor),
+ btr_cur_get_page_zip(cursor),
+ trx_id, mtr);
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
+ index, *offsets, mtr);
+ } else {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, *offsets));
+ }
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* We must empty the PAGE_FREE list, because if this
+ was a rollback, the shortened metadata record
+ would have too many fields, and we would be unable to
+ know the size of the freed record. */
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto return_after_reservations;
+ }
+ rec = page_cursor->rec;
+ } else {
+ lock_rec_restore_from_page_infimum(
+ *btr_cur_get_block(cursor), rec, block->page.id());
+ }
+
+ /* If necessary, restore also the correct lock state for a new,
+ preceding supremum record created in a page split. While the old
+ record was nonexistent, the supremum might have inherited its locks
+ from a wrong record. */
+
+ if (!was_first) {
+ err = btr_cur_pess_upd_restore_supremum(
+ btr_cur_get_block(cursor), rec, mtr);
+ }
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(err ||
+ !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+ btr_cur_get_page(cursor), index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ index->table->space->release_free_extents(n_reserved);
+ *big_rec = big_rec_vec;
+ return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/** Set or clear the delete-mark flag in the info bits of a record.
+The location of the info-bits byte is chosen by page_rec_is_comp(rec):
+REC_NEW_INFO_BITS when it holds, REC_OLD_INFO_BITS otherwise.
+@tparam flag the new value of the delete-mark flag
+@param[in,out] block buffer block that contains rec
+@param[in,out] rec record on a physical index page
+@param[in,out] mtr mini-transaction used for logging the change */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+{
+  if (!page_rec_is_comp(rec))
+  {
+    /* Old-style record: the write is logged with MAYBE_NOP */
+    ut_ad(!block->page.zip.data);
+    byte *info_bits= &rec[-REC_OLD_INFO_BITS];
+    const byte updated = flag
+      ? (*info_bits | REC_INFO_DELETED_FLAG)
+      : (*info_bits & byte(~REC_INFO_DELETED_FLAG));
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, updated);
+    return;
+  }
+
+  byte *info_bits= &rec[-REC_NEW_INFO_BITS];
+  const byte updated= flag
+    ? (*info_bits | REC_INFO_DELETED_FLAG)
+    : (*info_bits & byte(~REC_INFO_DELETED_FLAG));
+
+  if (*info_bits == updated)
+  {
+    /* The flag already has the requested value: nothing to write */
+    return;
+  }
+
+  if (UNIV_LIKELY_NULL(block->page.zip.data))
+  {
+    /* The block has a compressed page frame: update the byte in place
+    and let page_zip_rec_set_deleted() take care of the rest */
+    *info_bits= updated;
+    page_zip_rec_set_deleted(block, rec, flag, mtr);
+    return;
+  }
+
+  mtr->write<1>(*block, info_bits, updated);
+}
+
+template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
+template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
+
+/***********************************************************//**
+Delete-marks a clustered index record. An undo log record is written
+for the operation first; then the delete-mark flag is set in place, and
+finally the system columns of the record are updated via
+btr_cur_upd_rec_sys() with the deleting transaction and the roll pointer
+of the undo log record that was just created.
+A record that is already delete-marked is left untouched.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	buf_block_t*	block,	/*!< in/out: buffer block of the record */
+	rec_t*		rec,	/*!< in/out: record */
+	dict_index_t*	index,	/*!< in: clustered index of the record */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
+	que_thr_t*	thr,	/*!< in: query thread */
+	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
+				contains the virtual cols if there are any */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	ut_ad(buf_block_get_frame(block) == page_align(rec));
+	ut_ad(page_rec_is_leaf(rec));
+	ut_ad(mtr->is_named_space(index->table->space));
+
+	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+		/* An ON DELETE CASCADE operation may have delete-marked
+		this record already; in that case it must have been
+		done by our own transaction. */
+		ut_ad(row_get_rec_trx_id(rec, index, offsets)
+		      == thr_get_trx(thr)->id);
+		return(DB_SUCCESS);
+	}
+
+	roll_ptr_t	roll_ptr;
+	const dberr_t	undo_err = trx_undo_report_row_operation(
+		thr, index, entry, NULL, 0, rec, offsets, &roll_ptr);
+
+	if (undo_err != DB_SUCCESS) {
+		return(undo_err);
+	}
+
+	/* No search latch is taken: the adaptive hash index does not
+	depend on the delete-mark, and the flag is updated in place. */
+	btr_rec_set_deleted<true>(block, rec, mtr);
+
+	trx_t* const	deleting_trx = thr_get_trx(thr);
+
+	DBUG_LOG("ib_cur",
+		 "delete-mark clust " << index->table->name
+		 << " (" << index->id << ") by "
+		 << ib::hex(deleting_trx->id) << ": "
+		 << rec_printer(rec, offsets).str());
+
+	return btr_cur_upd_rec_sys(block, rec, index, offsets, deleting_trx,
+				   roll_ptr, mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Compresses (merges) the page of the cursor if that is recommended.
+The mtr is assumed to hold an x-latch on the tree and on the cursor page;
+to avoid deadlocks it must also own x-latches on the brothers of the page,
+if those brothers exist. NOTE: the caller is assumed to have reserved
+enough free extents so that the compression will always succeed if done!
+For a spatial index, nothing is done when lock_test_prdt_page_lock()
+fails for the page.
+@return whether compression occurred */
+bool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
+				cursor does not stay valid if !adjust and
+				compression occurs */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	if (cursor->index()->is_spatial()) {
+		/* The transaction is only known if the R-tree info
+		carries a query thread */
+		const trx_t*	locker = NULL;
+
+		if (cursor->rtr_info->thr) {
+			locker = thr_get_trx(cursor->rtr_info->thr);
+		}
+
+		const buf_block_t*	cur_block = btr_cur_get_block(cursor);
+
+		/* A page lock may prevent the compression */
+		if (!lock_test_prdt_page_lock(locker,
+					      cur_block->page.id())) {
+			return(false);
+		}
+	}
+
+	if (!btr_cur_compress_recommendation(cursor, mtr)) {
+		return(false);
+	}
+
+	return btr_compress(cursor, adjust, mtr) == DB_SUCCESS;
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
+If the deletion would leave the root page of the index logically empty,
+the whole page is emptied instead (see the comments in the function body).
+@return error code; this may also be an error reported by
+btr_page_reorganize() when a metadata record is removed
+@retval DB_SUCCESS if the record was deleted or the page was emptied
+@retval DB_FAIL if the record has externally stored fields or
+if the page would become too empty; in this case the siblings of the
+leaf page are prefetched for the pessimistic operation
+@retval DB_CORRUPTION if the successor of the page infimum could not be
+determined while checking whether the root page becomes empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
+ delete; cursor stays valid: if deletion
+ succeeds, on function exit it points to the
+ successor of the deleted record */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ DB_SUCCESS (formerly TRUE) on a leaf page
+ of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->is_named_space(cursor->index()->table->space));
+ ut_ad(!cursor->index()->is_dummy);
+
+ /* This is intended only for leaf page deletions */
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(block->page.id().space() == cursor->index()->table->space->id);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+ ut_ad(!dict_index_is_online_ddl(cursor->index())
+ || cursor->index()->is_clust()
+ || (flags & BTR_CREATE_FLAG));
+
+ rec = btr_cur_get_rec(cursor);
+
+ offsets = rec_get_offsets(rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ dberr_t err = DB_SUCCESS;
+ if (rec_offs_any_extern(offsets)
+ || !btr_cur_can_delete_without_compress(cursor,
+ rec_offs_size(offsets),
+ mtr)) {
+ /* The optimistic delete is refused here; prefetch
+ siblings of the leaf for the pessimistic
+ operation. */
+ btr_cur_prefetch_siblings(block, cursor->index());
+ err = DB_FAIL;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page
+ && page_get_n_recs(block->page.frame) == 1
+ + (cursor->index()->is_instant()
+ && !rec_is_metadata(rec, *cursor->index()))
+ && !cursor->index()
+ ->must_avoid_clear_instant_add())) {
+ /* The whole index (and table) becomes logically empty.
+ Empty the whole page. That is, if we are deleting the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN (not generic ALTER TABLE).
+ If we are deleting the metadata record and the
+ table becomes empty, clean up the whole page. */
+ dict_index_t* index = cursor->index();
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(block->page.frame));
+ if (UNIV_UNLIKELY(!first_rec)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+ ut_ad(!index->is_instant()
+ || rec_is_metadata(first_rec, *index));
+ const bool is_metadata = rec_is_metadata(rec, *index);
+ /* We can remove the metadata when rolling back an
+ instant ALTER TABLE operation, or when deleting the
+ last user record on the page such that only metadata for
+ instant ADD COLUMN (not generic ALTER TABLE) remains. */
+ const bool empty_table = is_metadata
+ || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index));
+ if (UNIV_LIKELY(empty_table)) {
+ if (UNIV_LIKELY(!is_metadata && !flags)) {
+ lock_update_delete(block, rec);
+ }
+ btr_page_empty(block, buf_block_get_page_zip(block),
+ index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+
+ page_cur_set_after_last(block,
+ btr_cur_get_page_cur(cursor));
+ goto func_exit;
+ }
+ }
+
+ {
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG)) {
+ /* This should be rolling back instant ADD COLUMN.
+ If this is a recovered transaction, then
+ index->is_instant() will hold until the
+ insert into SYS_COLUMNS is rolled back. */
+ ut_ad(cursor->index()->table->supports_instant());
+ ut_ad(cursor->index()->is_primary());
+ ut_ad(!page_zip);
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
+ /* We must empty the PAGE_FREE list, because
+ after rollback, this deleted metadata record
+ would have too many fields, and we would be
+ unable to know the size of the freed record. */
+ err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ mtr);
+ goto func_exit;
+ } else {
+ if (!flags) {
+ lock_update_delete(block, rec);
+ }
+
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ if (page_zip) {
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page,
+ cursor->index()));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page,
+ cursor->index()));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* On compressed pages, the IBUF_BITMAP_FREE
+ space is not affected by deleting (purging)
+ records, because it is defined as the minimum
+ of space available *without* reorganize, and
+ space available in the modification log. */
+ } else {
+ const ulint max_ins
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
+
+ /* The change buffer does not handle inserts
+ into non-leaf pages, into clustered indexes,
+ or into the change buffer. Note that max_ins was
+ computed before the record was deleted. */
+ if (!cursor->index()->is_clust()
+ && !cursor->index()->table->is_temporary()
+ && !dict_index_is_ibuf(cursor->index())) {
+ ibuf_update_free_bits_low(block, max_ins, mtr);
+ }
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return err;
+}
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+Unless has_reserved_extents is set, free extents are reserved at the
+start and released again before returning.
+@return TRUE if compression occurred (this includes discarding the page
+and emptying the root page) and FALSE if not or something
+wrong; *err must be checked to tell an error from "no compression". */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size;
+ DB_CORRUPTION or an error reported by
+ fsp_reserve_free_extents(),
+ btr_page_reorganize(),
+ btr_cur_node_ptr_delete() or
+ btr_insert_on_non_leaf_level() is also
+ possible */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ bool rollback,/*!< in: performing rollback? */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_index_t* index;
+ rec_t* rec;
+ uint32_t n_reserved = 0;
+ ibool ret = FALSE;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+#ifdef UNIV_DEBUG
+ bool parent_latched = false;
+#endif /* UNIV_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(!index->is_dummy);
+ ut_ad(block->page.id().space() == index->table->space->id);
+
+ if (!has_reserved_extents) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the node pointer updates will
+ not fail because of lack of space */
+
+ uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
+
+ *err = fsp_reserve_free_extents(&n_reserved,
+ index->table->space,
+ n_extents,
+ FSP_CLEANING, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return(FALSE);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+ rec = btr_cur_get_rec(cursor);
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (rec_offs_any_extern(offsets)) {
+ btr_rec_free_externally_stored_fields(index,
+ rec, offsets, block,
+ rollback, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ rec_t* next_rec = NULL;
+ bool min_mark_next_rec = false;
+
+ if (page_is_leaf(page)) {
+ const bool is_metadata = rec_is_metadata(
+ rec, page_rec_is_comp(rec));
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* This should be rolling back instant ALTER TABLE.
+ If this is a recovered transaction, then
+ index->is_instant() will hold until the
+ insert into SYS_COLUMNS is rolled back. */
+ ut_ad(rollback);
+ ut_ad(index->table->supports_instant());
+ ut_ad(index->is_primary());
+ } else if (flags == 0) {
+ lock_update_delete(block, rec);
+ }
+
+ if (block->page.id().page_no() != index->page) {
+ if (page_get_n_recs(page) < 2) {
+ goto discard_page;
+ }
+ } else if (page_get_n_recs(page) == 1
+ + (index->is_instant() && !is_metadata)
+ && !index->must_avoid_clear_instant_add()) {
+ /* The whole index (and table) becomes logically empty.
+ Empty the whole page. That is, if we are deleting the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN
+ (not generic ALTER TABLE).
+ If we are deleting the metadata record
+ (in the rollback of instant ALTER TABLE) and the
+ table becomes empty, clean up the whole page. */
+
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(page));
+ if (UNIV_UNLIKELY(!first_rec)) {
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ ut_ad(!index->is_instant()
+ || rec_is_metadata(first_rec, *index));
+ if (is_metadata || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index))) {
+ btr_page_empty(block, page_zip, index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+
+ page_cur_set_after_last(
+ block,
+ btr_cur_get_page_cur(cursor));
+ ret = TRUE;
+ goto return_after_reservations;
+ }
+ }
+
+ if (UNIV_LIKELY(!is_metadata)) {
+ btr_search_update_hash_on_delete(cursor);
+ } else {
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
+ /* We must empty the PAGE_FREE list, because
+ after rollback, this deleted metadata record
+ would carry too many fields, and we would be
+ unable to know the size of the freed record. */
+ *err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ mtr);
+ ut_ad(!ret);
+ goto err_exit;
+ }
+ } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
+ if (page_rec_is_last(rec, page)) {
+discard_page:
+ ut_ad(page_get_n_recs(page) == 1);
+ /* If there is only one record, drop
+ the whole page. */
+
+ btr_discard_page(cursor, mtr);
+
+ ret = TRUE;
+ goto return_after_reservations;
+ }
+
+ if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) {
+ ut_ad(!ret);
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ btr_cur_t cursor; /* cursor for the parent page; it shadows
+ the function parameter "cursor" until the end of this
+ else-if branch */
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
+
+ if (!page_has_prev(page)) {
+ /* If we delete the leftmost node pointer on a
+ non-leaf level, we must mark the new leftmost node
+ pointer as the predefined minimum record */
+
+ min_mark_next_rec = true;
+ } else if (index->is_spatial()) {
+ /* For rtree, if delete the leftmost node pointer,
+ we need to update parent page. */
+ rtr_mbr_t father_mbr;
+ rec_t* father_rec;
+ rec_offs* offsets;
+ ulint len;
+
+ rtr_page_get_father_block(NULL, heap, mtr, NULL,
+ &cursor);
+ father_rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(father_rec, index, NULL,
+ 0, ULINT_UNDEFINED, &heap);
+
+ rtr_read_mbr(rec_get_nth_field(
+ father_rec, offsets, 0, &len), &father_mbr);
+
+ rtr_update_mbr_field(&cursor, offsets, NULL,
+ page, &father_mbr, next_rec, mtr);
+ ut_d(parent_latched = true);
+ } else {
+ /* Otherwise, if we delete the leftmost node pointer
+ on a page, we have to change the parent node pointer
+ so that it is equal to the new leftmost node pointer
+ on the page.
+ NOTE(review): ret is assigned the result of
+ btr_page_get_father() here and stays nonzero on
+ success, so the "!ret" check before the compression
+ attempt below skips the merge and the function
+ returns that value - confirm that this is intended. */
+ ret = btr_page_get_father(mtr, &cursor);
+ if (!ret) {
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ *err = btr_cur_node_ptr_delete(&cursor, mtr);
+ if (*err != DB_SUCCESS) {
+got_err:
+ ret = FALSE;
+ goto err_exit;
+ }
+
+ const ulint level = btr_page_get_level(page);
+ // FIXME: reuse the node_ptr from above
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ index, next_rec, block->page.id().page_no(),
+ heap, level);
+
+ *err = btr_insert_on_non_leaf_level(
+ flags, index, level + 1, node_ptr, mtr);
+ if (*err != DB_SUCCESS) {
+ ret = FALSE;
+ goto got_err;
+ }
+
+ ut_d(parent_latched = true);
+ }
+ }
+
+ /* SPATIAL INDEX never use U locks; we can allow page merges
+ while holding X lock on the spatial index tree.
+ Do not allow merges of non-leaf B-tree pages unless it is
+ safe to do so. Note that allow_merge is evaluated before the
+ record is deleted from the page. */
+ {
+ const bool allow_merge = page_is_leaf(page)
+ || dict_index_is_spatial(index)
+ || btr_cur_will_modify_tree(
+ index, page, BTR_INTENTION_DELETE, rec,
+ btr_node_ptr_max_size(index),
+ block->zip_size(), mtr);
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
+
+ if (min_mark_next_rec) {
+ btr_set_min_rec_mark(next_rec, *block, mtr);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_ad(!parent_latched
+ || btr_check_node_ptr(index, block, mtr));
+
+ if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
+ if (UNIV_LIKELY(allow_merge)) {
+ ret = btr_cur_compress_if_useful(
+ cursor, FALSE, mtr);
+ } else {
+ ib::warn() << "Not merging page "
+ << block->page.id()
+ << " in index " << index->name
+ << " of " << index->table->name;
+ ut_ad("MDEV-14637" == 0);
+ }
+ }
+ }
+
+return_after_reservations:
+ *err = DB_SUCCESS;
+err_exit:
+ mem_heap_free(heap);
+
+#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
+ if (page_is_leaf(page)
+ && !dict_index_is_online_ddl(index)) {
+ mtr->release(index->lock);
+ /* NOTE: We cannot release root block latch here, because it
+ has segment header and already modified in most of cases.*/
+ }
+#endif
+
+ index->table->space->release_free_extents(n_reserved);
+ return(ret);
+}
+
+/** Delete the node pointer in a parent page.
+The record is removed with btr_cur_pessimistic_delete(), passing
+has_reserved_extents=TRUE and BTR_CREATE_FLAG. If that succeeded without
+compressing the page, a compression is attempted separately.
+@param[in,out]	parent	cursor pointing to parent record
+@param[in,out]	mtr	mini-transaction
+@return the error code reported by btr_cur_pessimistic_delete() */
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+{
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	dberr_t		status;
+	const ibool	merged = btr_cur_pessimistic_delete(
+		&status, TRUE, parent, BTR_CREATE_FLAG, false, mtr);
+
+	if (status != DB_SUCCESS) {
+		return status;
+	}
+
+	if (!merged) {
+		/* The outcome of this attempt does not affect
+		the return value */
+		btr_cur_compress_if_useful(parent, FALSE, mtr);
+	}
+
+	return status;
+}
+
+/** Represents the cursor for the number of rows estimation. The
+content is used for level-by-level diving and estimation the number of rows
+on each level. */
+class btr_est_cur_t
+{
+ /* Assume a page like:
+ records: (inf, a, b, c, d, sup)
+ index of the record: 0, 1, 2, 3, 4, 5
+ */
+
+ /** Index of the record where the page cursor stopped on this level
+ (index in alphabetical order). In the above example, if the search stopped on
+ record 'c', then nth_rec will be 3. */
+ ulint m_nth_rec;
+
+ /** Number of the records on the page, not counting inf and sup.
+ In the above example n_recs will be 4. */
+ ulint m_n_recs;
+
+ /** Search tuple */
+ const dtuple_t &m_tuple;
+ /** Cursor search mode */
+ page_cur_mode_t m_mode;
+ /** Page cursor which is used for search */
+ page_cur_t m_page_cur;
+ /** Page id of the page to get on level down, can differ from
+ m_block->page.id at the moment when the child's page id is already found, but
+ the child's block has not fetched yet */
+ page_id_t m_page_id;
+ /** Current block */
+ buf_block_t *m_block;
+ /** Page search mode, can differ from m_mode for non-leaf pages, see c-tor
+ comments for details */
+ page_cur_mode_t m_page_mode;
+
+ /** Matched fields and bytes which are used for on-page search, see
+ btr_cur_t::(up|low)_(match|bytes) comments for details */
+ ulint m_up_match= 0;
+ ulint m_up_bytes= 0;
+ ulint m_low_match= 0;
+ ulint m_low_bytes= 0;
+
+public:
+ btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple,
+ page_cur_mode_t mode)
+ : m_tuple(tuple), m_mode(mode),
+ m_page_id(index->table->space_id, index->page), m_block(nullptr)
+ {
+
+ ut_ad(dict_index_check_search_tuple(index, &tuple));
+ ut_ad(dtuple_check_typed(&tuple));
+
+ m_page_cur.index = index;
+ /* We use these modified search modes on non-leaf levels of the B-tree.
+ These let us end up in the right B-tree leaf. In that leaf we use the
+ original search mode. */
+ switch (mode) {
+ case PAGE_CUR_GE:
+ m_page_mode= PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ m_page_mode= PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE ||
+ mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ m_page_mode= mode;
+ break;
+ }
+ }
+
+ /** Retrieve block with m_page_id, release the previously gotten block
+ if necessary. If this is a left border block cursor and both left and right
+ border blocks have the same parent, don't unlatch the parent, as it must be
+ latched to get the right block, and will be unlatched after the right block
+ is fetched.
+ @param level distance from the leaf page level; ULINT_UNDEFINED when
+ fetching the root page
+ @param mtr mtr
+ @param right_parent right border block parent, nullptr if the function
+ is called for the right block itself
+ @return true on success or false otherwise. */
+ bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent)
+ {
+ buf_block_t *parent_block= m_block;
+
+ m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level,
+ &mtr, nullptr);
+ if (!m_block)
+ return false;
+
+ if (parent_block && parent_block != right_parent)
+ {
+ ut_ad(mtr.get_savepoint() >= 2);
+ mtr.rollback_to_savepoint(1, 2);
+ }
+
+ return level == ULINT_UNDEFINED ||
+ btr_page_get_level(m_block->page.frame) == level;
+ }
+
+ /** Sets page mode for leaves */
+ void set_page_mode_for_leaves() { m_page_mode= m_mode; }
+
+ /** Does search on the current page. If there is no border in m_tuple, then
+ just move the cursor to the most left or right record.
+ @param level current level on tree.
+ @param root_height root height
+ @param left true if this is left border, false otherwise.
+ @return true on success, false otherwise. */
+ bool search_on_page(ulint level, ulint root_height, bool left)
+ {
+ if (level != btr_page_get_level(m_block->page.frame))
+ return false;
+
+ m_n_recs= page_get_n_recs(m_block->page.frame);
+
+ if (dtuple_get_n_fields(&m_tuple) > 0)
+ {
+ m_up_bytes= m_low_bytes= 0;
+ m_page_cur.block= m_block;
+ if (page_cur_search_with_match(&m_tuple, m_page_mode,
+ &m_up_match, &m_low_match, &m_page_cur,
+ nullptr))
+ return false;
+ m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur));
+ }
+ else if (left)
+ {
+ page_cur_set_before_first(m_block, &m_page_cur);
+ if (level)
+ {
+ if (!page_cur_move_to_next(&m_page_cur))
+ return false;
+ m_nth_rec= 1;
+ }
+ else
+ m_nth_rec= 0;
+ }
+ else
+ {
+ m_nth_rec= m_n_recs;
+ if (!level)
+ {
+ page_cur_set_after_last(m_block, &m_page_cur);
+ ++m_nth_rec;
+ }
+ else
+ {
+ m_page_cur.block= m_block;
+ m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec);
+ }
+ }
+
+ return true;
+ }
+
+ /** Read page id of the current record child.
+ @param offsets offsets array.
+ @param heap heap for offsets array */
+ void read_child_page_id(rec_offs **offsets, mem_heap_t **heap)
+ {
+ const rec_t *node_ptr= page_cur_get_rec(&m_page_cur);
+
+ /* FIXME: get the child page number directly without computing offsets */
+ *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED,
+ heap);
+
+ /* Go to the child node */
+ m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets));
+ }
+
+ /** @return true if left border should be counted */
+ bool should_count_the_left_border() const
+ {
+ if (dtuple_get_n_fields(&m_tuple) > 0)
+ {
+ ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
+ return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur));
+ }
+ ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
+ return false;
+ }
+
+ /** Decide whether the record under the cursor counts as the right border
+ of the range.
+ @return true if right border should be counted */
+ bool should_count_the_right_border() const
+ {
+   const rec_t *const rec= page_cur_get_rec(&m_page_cur);
+
+   if (!dtuple_get_n_fields(&m_tuple))
+   {
+     /* The range has no right border (just 'x > 123' or 'x >= 123'), and
+     search_on_page() positioned the cursor on the supremum record of the
+     rightmost page, which must not be counted. */
+     ut_ad(page_rec_is_supremum(rec));
+     return false;
+   }
+
+   ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec)));
+
+   /* The range is '<=': count the border only if the record was found. */
+   if (m_mode == PAGE_CUR_LE)
+     return m_low_match >= dtuple_get_n_fields(&m_tuple);
+
+   /* The range is '<': count the border only if some record matches the
+   criteria. For example, if the minimum record in the tree is 5 and x < 7
+   is specified, the cursor is positioned on 5 and the border is counted;
+   if x < 2 is specified, the cursor is positioned on 'inf' and the border
+   is not counted. */
+   if (m_mode == PAGE_CUR_L)
+     return !page_rec_is_infimum(rec);
+
+   /* Notice that for "WHERE col <= 'foo'" the server passes to
+   ha_innobase::records_in_range(): min_key=NULL (left-unbounded), which is
+   expected, and max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
+   unexpected - one would expect flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In
+   this case the cursor is positioned on the first record to the right of
+   the requested one (possibly on the 'sup') and the right border must not
+   be counted. */
+   return false;
+ }
+
+ /** @return the index recorded in the page cursor of this estimator */
+ const dict_index_t *index() const
+ {
+   return m_page_cur.index;
+ }
+
+ /** @return the buffer block this cursor currently refers to */
+ const buf_block_t *block() const
+ {
+   return m_block;
+ }
+
+ /** @return the identifier of the page this cursor currently targets
+ (returned by value) */
+ page_id_t page_id() const
+ {
+   return m_page_id;
+ }
+
+ /** Adopt the block pointer of another cursor, for the case when the left
+ and right border cursors point to the same block. Only the block pointer
+ is copied; no other member is touched.
+ @param o the other btr_est_cur_t object whose block is adopted */
+ void set_block(const btr_est_cur_t &o)
+ {
+   m_block= o.m_block;
+ }
+
+ /** @return the ordinal number of the record the cursor is positioned on,
+ as computed by search_on_page() */
+ ulint nth_rec() const
+ {
+   return m_nth_rec;
+ }
+
+ /** @return number of records in the current page */
+ ulint n_recs() const
+ {
+   return m_n_recs;
+ }
+};
+
+/** Estimate the number of rows between the left record of the path and the
+right one (non-inclusive) on a certain level of a B-tree. This function
+starts from the page next to the left page and reads a few pages to the right,
+counting their records. After some page is latched, the previously scanned
+page is unlatched. If the right page is reached quickly, the exact number of
+records between the left and right records is known and is_n_rows_exact is
+set to true.
+
+Otherwise is_n_rows_exact is set to false. This happens when the page limit
+is reached, when the end of the level is reached without passing through
+right_page_no, when a page cannot be fetched, or when a fetched page belongs
+to another level. In that case the average number of records in the pages
+scanned so far is multiplied by the number of pages between the right and
+left records (which is n_rows_on_prev_level). If no page could be scanned at
+all, a fixed guess of 10 rows is returned.
+@param level current level.
+@param left_cur the cursor of the left page.
+@param right_page_no right page number.
+@param n_rows_on_prev_level number of rows on the previous level.
+@param[out] is_n_rows_exact true if exact rows number is returned.
+@param[in,out] mtr mini-transaction holding the page latches; latches of
+pages scanned by this function are released through savepoints.
+@return number of rows, not including the borders (exact or estimated). */
+static ha_rows btr_estimate_n_rows_in_range_on_level(
+    ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no,
+    ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr)
+{
+  ha_rows n_rows= 0;
+  uint n_pages_read= 0;
+  /* Do not read more than this number of pages in order not to hurt
+  performance with this code which is just an estimation. If we read this many
+  pages before reaching right_page_no, then we estimate the average from the
+  pages scanned so far. */
+  static constexpr uint n_pages_read_limit= 9;
+  buf_block_t *block= nullptr;
+  const dict_index_t *index= left_cur.index();
+
+  /* Assume by default that we will scan all pages between left and right
+  (non-inclusive) pages */
+  is_n_rows_exact= true;
+
+  /* Add records from the left page which are to the right of the record which
+  serves as a left border of the range, if any (we don't include the record
+  itself in this count). */
+  if (left_cur.nth_rec() <= left_cur.n_recs())
+  {
+    n_rows+= left_cur.n_recs() - left_cur.nth_rec();
+  }
+
+  /* Count the records in the pages between left and right (non-inclusive)
+  pages */
+
+  const fil_space_t *space= index->table->space;
+  page_id_t page_id(space->id,
+                    btr_page_get_next(buf_block_get_frame(left_cur.block())));
+
+  if (page_id.page_no() == FIL_NULL)
+    goto inexact;
+
+  do
+  {
+    page_t *page;
+    buf_block_t *prev_block= block;
+
+    /* Fetch the page. */
+    block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr,
+                         nullptr);
+
+    if (prev_block)
+    {
+      ulint savepoint = mtr.get_savepoint();
+      /* Index s-lock, p1, p2 latches, can also be p1 and p2 parent latch if
+      they are not diverged */
+      ut_ad(savepoint >= 3);
+      /* Release the previously scanned page, which sits just below the page
+      that was latched above. */
+      mtr.rollback_to_savepoint(savepoint - 2, savepoint - 1);
+    }
+
+    if (!block || btr_page_get_level(buf_block_get_frame(block)) != level)
+      goto inexact;
+
+    page= buf_block_get_frame(block);
+
+    /* It is possible but highly unlikely that the page was originally written
+    by an old version of InnoDB that did not initialize FIL_PAGE_TYPE on other
+    than B-tree pages. For example, this could be an almost-empty BLOB page
+    that happens to contain the magic values in the fields
+    that we checked above. */
+
+    n_pages_read++;
+
+    n_rows+= page_get_n_recs(page);
+
+    page_id.set_page_no(btr_page_get_next(page));
+
+    if (n_pages_read == n_pages_read_limit ||
+        page_id.page_no() == FIL_NULL)
+    {
+      /* We read too many pages or we reached the end of the level
+      without passing through right_page_no. Stop here instead of asking
+      the buffer pool for page FIL_NULL on the next iteration. */
+      goto inexact;
+    }
+
+  } while (page_id.page_no() != right_page_no);
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  return (n_rows);
+
+inexact:
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  is_n_rows_exact= false;
+
+  /* We did interrupt before reaching right page */
+
+  if (n_pages_read > 0)
+  {
+    /* The number of pages on this level is
+    n_rows_on_prev_level, multiply it by the
+    average number of recs per page so far */
+    n_rows= n_rows_on_prev_level * n_rows / n_pages_read;
+  }
+  else
+  {
+    /* Nothing was scanned: fall back to a fixed guess. */
+    n_rows= 10;
+  }
+
+  return (n_rows);
+}
+
+/** Estimates the number of rows in a given index range. Do search in the left
+page, then if there are pages between left and right ones, read a few pages to
+the right, if the right page is reached, count the exact number of rows without
+fetching the right page, the right page will be fetched in the caller of this
+function and the amount of its rows will be added. If the right page is not
+reached, count the estimated(see btr_estimate_n_rows_in_range_on_level() for
+details) rows number, and fetch the right page. If leaves are reached, unlatch
+non-leaf pages except the right leaf parent. After the right leaf page is
+fetched, commit mtr.
+
+The two borders are searched level by level, starting from the root. The index
+latch is released once the leaf level has been reached, before searching on
+the leaf pages. If the estimate is not exact, it is doubled when the paths
+diverged above the leaf level, and it is capped at half of the table row
+count. On success, range_start->page_id and range_end->page_id are set to the
+pages on which the two cursors ended.
+@param[in] index index
+@param[in,out] range_start range start; page_id is assigned on success
+@param[in,out] range_end range end; page_id is assigned on success
+@return estimated number of rows
+@retval 0 if the index has no root page or is corrupted, or if fetching or
+searching a page fails */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+ btr_pos_t *range_start,
+ btr_pos_t *range_end)
+{
+ DBUG_ENTER("btr_estimate_n_rows_in_range");
+
+ if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted()))
+ DBUG_RETURN(0);
+
+ ut_ad(index->is_btree());
+
+ btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
+ btr_est_cur_t p2(index, *range_end->tuple, range_end->mode);
+ mtr_t mtr;
+
+ ulint height;
+ ulint root_height= 0; /* remove warning */
+
+ mem_heap_t *heap= NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+
+ mtr.start();
+
+ ut_ad(mtr.get_savepoint() == 0);
+ mtr_s_lock_index(index, &mtr);
+
+ ha_rows table_n_rows= dict_table_get_n_rows(index->table);
+
+ height= ULINT_UNDEFINED; /* ULINT_UNDEFINED means: the root is not read yet */
+
+ /* This becomes true when the two paths do not pass through the same pages
+ anymore. */
+ bool diverged= false;
+ /* This is the height, i.e. the number of levels from the root, where paths
+ are not the same or adjacent any more. */
+ ulint divergence_height= ULINT_UNDEFINED;
+ bool should_count_the_left_border= true;
+ bool should_count_the_right_border= true;
+ bool is_n_rows_exact= true;
+ ha_rows n_rows= 0;
+
+ /* Loop and search until we arrive at the desired level. One iteration
+ handles one level of the tree for both cursors. */
+search_loop:
+ if (!p1.fetch_child(height, mtr, p2.block()))
+ goto error;
+
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= btr_page_get_level(buf_block_get_frame(p1.block()));
+ root_height= height;
+ }
+
+ if (!height)
+ {
+ p1.set_page_mode_for_leaves();
+ p2.set_page_mode_for_leaves();
+ }
+
+ if (p1.page_id() == p2.page_id())
+ p2.set_block(p1); /* same page: reuse the block fetched for p1 */
+ else
+ {
+ ut_ad(diverged);
+ if (divergence_height != ULINT_UNDEFINED) {
+ /* We need to call p1.search_on_page() here as
+ btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and
+ p1.m_nth_rec. */
+ if (!p1.search_on_page(height, root_height, true))
+ goto error;
+ n_rows= btr_estimate_n_rows_in_range_on_level(
+ height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr);
+ }
+ if (!p2.fetch_child(height, mtr, nullptr))
+ goto error;
+ }
+
+ if (height == 0)
+ /* There is no need to release non-leaf pages here as they must already be
+ unlatched in btr_est_cur_t::fetch_child(). Try to search on pages after
+ releasing the index latch, to decrease contention. */
+ mtr.rollback_to_savepoint(0, 1);
+
+ /* There is no need to search on left page if
+ divergence_height != ULINT_UNDEFINED, as it was already searched before
+ btr_estimate_n_rows_in_range_on_level() call */
+ if (divergence_height == ULINT_UNDEFINED &&
+ !p1.search_on_page(height, root_height, true))
+ goto error;
+
+ if (!p2.search_on_page(height, root_height, false))
+ goto error;
+
+ if (!diverged && (p1.nth_rec() != p2.nth_rec()))
+ {
+ ut_ad(p1.page_id() == p2.page_id());
+ diverged= true;
+ if (p1.nth_rec() < p2.nth_rec())
+ {
+ /* We do not count the borders (neither the left nor the right one), thus
+ "- 1". */
+ n_rows= p2.nth_rec() - p1.nth_rec() - 1;
+
+ if (n_rows > 0)
+ {
+ /* There is at least one row between the two borders pointed to by p1
+ and p2, so on the level below the slots will point to non-adjacent
+ pages. */
+ divergence_height= root_height - height;
+ }
+ }
+ else
+ {
+ /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we have
+ a single page tree which contains (inf, 5, 6, supr) and we select where x
+ > 20 and x < 30; in this case p1->nth_rec will point to the supr record
+ and p2->nth_rec will point to 6. */
+ n_rows= 0;
+ should_count_the_left_border= false;
+ should_count_the_right_border= false;
+ }
+ }
+ else if (diverged && divergence_height == ULINT_UNDEFINED)
+ {
+
+ if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1)
+ {
+ ut_ad(p1.page_id() != p2.page_id());
+ divergence_height= root_height - height;
+
+ n_rows= 0;
+
+ if (p1.nth_rec() < p1.n_recs())
+ {
+ n_rows+= p1.n_recs() - p1.nth_rec();
+ }
+
+ if (p2.nth_rec() > 1)
+ {
+ n_rows+= p2.nth_rec() - 1;
+ }
+ }
+ }
+ else if (divergence_height != ULINT_UNDEFINED)
+ {
+ /* All records before the right page were already counted. Add records from
+ p2->page_no which are to the left of the record which serves as a right
+ border of the range, if any (we don't include the record itself in this
+ count). */
+ if (p2.nth_rec() > 1)
+ n_rows+= p2.nth_rec() - 1;
+ }
+
+ if (height)
+ {
+ ut_ad(height > 0);
+ height--;
+ ut_ad(mtr.memo_contains(p1.index()->lock, MTR_MEMO_S_LOCK));
+ ut_ad(mtr.memo_contains_flagged(p1.block(), MTR_MEMO_PAGE_S_FIX));
+ p1.read_child_page_id(&offsets, &heap);
+ ut_ad(mtr.memo_contains(p2.index()->lock, MTR_MEMO_S_LOCK));
+ ut_ad(mtr.memo_contains_flagged(p2.block(), MTR_MEMO_PAGE_S_FIX));
+ p2.read_child_page_id(&offsets, &heap);
+ goto search_loop;
+ }
+
+ should_count_the_left_border=
+ should_count_the_left_border && p1.should_count_the_left_border();
+ should_count_the_right_border=
+ should_count_the_right_border && p2.should_count_the_right_border();
+
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+
+ range_start->page_id= p1.page_id();
+ range_end->page_id= p2.page_id();
+
+ /* Here none of the borders were counted. For example, if on the leaf level
+ we descended to:
+ (inf, a, b, c, d, e, f, sup)
+ ^ ^
+ path1 path2
+ then n_rows will be 2 (c and d). */
+
+ if (is_n_rows_exact)
+ {
+ /* Only fiddle to adjust this off-by-one if the number is exact, otherwise
+ we do much grosser adjustments below. */
+
+ /* If both paths end up on the same record on the leaf level. */
+ if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec())
+ {
+
+ /* n_rows can be > 0 here if the paths were first different and then
+ converged to the same record on the leaf level.
+ For example:
+ SELECT ... LIKE 'wait/synch/rwlock%'
+ mode1=PAGE_CUR_GE,
+ tuple1="wait/synch/rwlock"
+ path1[0]={nth_rec=58, n_recs=58,
+ page_no=3, page_level=1}
+ path1[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0}
+
+ mode2=PAGE_CUR_G
+ tuple2="wait/synch/rwlock"
+ path2[0]={nth_rec=57, n_recs=57,
+ page_no=3, page_level=1}
+ path2[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0} */
+
+ /* If the range is such that we should count both borders, then avoid
+ counting that record twice - once as a left border and once as a right
+ border. Some of the borders should not be counted, e.g. [3,3). */
+ n_rows= should_count_the_left_border && should_count_the_right_border;
+ }
+ else
+ n_rows+= should_count_the_left_border + should_count_the_right_border;
+ }
+
+ if (root_height > divergence_height && !is_n_rows_exact)
+ /* In trees whose height is > 1 our algorithm tends to underestimate:
+ multiply the estimate by 2: */
+ n_rows*= 2;
+
+ DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows););
+
+ /* Do not estimate the number of rows in the range to over 1 / 2 of the
+ estimated rows in the whole table */
+
+ if (n_rows > table_n_rows / 2 && !is_n_rows_exact)
+ {
+
+ n_rows= table_n_rows / 2;
+
+ /* If there are just 0 or 1 rows in the table, then we estimate all rows
+ are in the range */
+
+ if (n_rows == 0)
+ n_rows= table_n_rows;
+ }
+
+ DBUG_RETURN(n_rows);
+
+error:
+ mtr.commit(); /* releases whatever is still latched in this mtr */
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/** Locate the field reference (BLOB pointer) of an externally stored field.
+The reference occupies the last BTR_EXTERN_FIELD_REF_SIZE bytes of the
+locally stored part of the field.
+@param offsets array returned by rec_get_offsets()
+@param n       index of the external field
+@return offset of the field reference, to be added to the record pointer */
+static
+ulint
+btr_rec_get_field_ref_offs(
+  const rec_offs* offsets,
+  ulint           n)
+{
+  ut_a(rec_offs_nth_extern(offsets, n));
+
+  ulint len;
+  const ulint start = rec_get_nth_field_offs(offsets, n, &len);
+
+  /* The field must have a stored length that can hold the reference. */
+  ut_a(len_is_stored(len));
+  ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+  return start + (len - BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/** Gets a pointer to the field reference of an externally stored field,
+that is, to the position inside the record that
+btr_rec_get_field_ref_offs() computes: the last BTR_EXTERN_FIELD_REF_SIZE
+bytes of the locally stored part of the field. The result is an address
+within rec, not within the externally stored data.
+@param rec record
+@param offsets rec_get_offsets(rec)
+@param n index of the externally stored field
+@return pointer to the field reference within rec */
+#define btr_rec_get_field_ref(rec, offsets, n) \
+ ((rec) + btr_rec_get_field_ref_offs(offsets, n))
+
+/** Compute how many database pages the externally stored fields of a record
+occupy. The length of each external field is read from its field reference
+and rounded up to a multiple of srv_page_size before being summed.
+@param[in] rec     record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+  const rec_t*    rec,
+  const rec_offs* offsets)
+{
+  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+  ulint bytes = 0;
+
+  if (rec_offs_any_extern(offsets)) {
+    const ulint n = rec_offs_n_fields(offsets);
+
+    for (ulint field_no = 0; field_no < n; field_no++) {
+      if (!rec_offs_nth_extern(offsets, field_no)) {
+        continue;
+      }
+
+      /* Only the 4 bytes at BTR_EXTERN_LEN + 4 of the
+      field reference are read as the length. */
+      const ulint len = mach_read_from_4(
+        btr_rec_get_field_ref(rec, offsets, field_no)
+        + BTR_EXTERN_LEN + 4);
+
+      bytes += ut_calc_align(len, ulint(srv_page_size));
+    }
+  }
+
+  return bytes >> srv_page_size_shift;
+}
+
+/** Set or clear the ownership of an externally stored field in a record.
+The flag is stored inverted: BTR_EXTERN_OWNER_FLAG is cleared when the
+record owns the field (val == true) and set when it does not.
+@param block   in/out: index page
+@param rec     in/out: clustered index record
+@param index   in: index of the page
+@param offsets in: array returned by rec_get_offsets()
+@param i       in: field number
+@param val     in: value to set
+@param mtr     in/out: mini-transaction; dereferenced on the uncompressed
+               path and passed to page_zip_write_blob_ptr() otherwise */
+static
+void
+btr_cur_set_ownership_of_extern_field(
+  buf_block_t*    block,
+  rec_t*          rec,
+  dict_index_t*   index,
+  const rec_offs* offsets,
+  ulint           i,
+  bool            val,
+  mtr_t*          mtr)
+{
+  ulint field_len;
+  byte* const field = rec_get_nth_field(rec, offsets, i, &field_len);
+
+  ut_ad(rec_offs_nth_extern(offsets, i));
+  ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+  /* The flag lives in the byte at BTR_EXTERN_LEN of the field reference,
+  which is the trailing part of the locally stored field. */
+  byte* const flag_byte = field
+    + (field_len - BTR_EXTERN_FIELD_REF_SIZE) + BTR_EXTERN_LEN;
+  ulint flags = mach_read_from_1(flag_byte);
+
+  if (!val) {
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+    ut_a(!(flags & BTR_EXTERN_OWNER_FLAG));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+    flags |= BTR_EXTERN_OWNER_FLAG;
+  } else {
+    flags &= ~BTR_EXTERN_OWNER_FLAG;
+  }
+
+  if (!block->page.zip.data) {
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, flag_byte, flags);
+    return;
+  }
+
+  /* ROW_FORMAT=COMPRESSED: update the uncompressed frame directly and
+  let page_zip_write_blob_ptr() handle the compressed page. */
+  mach_write_to_1(flag_byte, flags);
+  page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
+}
+
+/** Mark every off-page field that the update vector does not touch as
+disowned by this record. The ownership must be transferred to the updated
+record which is inserted elsewhere in the index tree. In purge only the
+owner of an externally stored field is allowed to free the field.
+@param block   in/out: index page
+@param rec     in/out: record in a clustered index
+@param index   in: index of the page
+@param offsets in: array returned by rec_get_offsets()
+@param update  in: update vector
+@param mtr     in/out: mini-transaction */
+void
+btr_cur_disown_inherited_fields(
+  buf_block_t*    block,
+  rec_t*          rec,
+  dict_index_t*   index,
+  const rec_offs* offsets,
+  const upd_t*    update,
+  mtr_t*          mtr)
+{
+  ut_ad(rec_offs_validate(rec, index, offsets));
+  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+  ut_ad(rec_offs_any_extern(offsets));
+
+  const ulint n_fields = rec_offs_n_fields(offsets);
+
+  for (uint16_t field_no = 0; field_no < n_fields; field_no++) {
+    if (!rec_offs_nth_extern(offsets, field_no)) {
+      continue;
+    }
+
+    if (upd_get_field_by_field_no(update, field_no, false)) {
+      /* The field is being updated, so it is not inherited. */
+      continue;
+    }
+
+    btr_cur_set_ownership_of_extern_field(
+      block, rec, index, offsets, field_no, false, mtr);
+  }
+}
+
+/** Mark all externally stored fields of a record as owned by the record.
+This function should be called if the delete mark of a record is removed:
+a not delete marked record always owns all its extern fields.
+@param block   in/out: index page
+@param rec     in/out: record in a clustered index
+@param index   in: index of the page
+@param offsets in: array returned by rec_get_offsets()
+@param mtr     in/out: mini-transaction, passed through to
+               btr_cur_set_ownership_of_extern_field() */
+static
+void
+btr_cur_unmark_extern_fields(
+  buf_block_t*    block,
+  rec_t*          rec,
+  dict_index_t*   index,
+  const rec_offs* offsets,
+  mtr_t*          mtr)
+{
+  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+  if (rec_offs_any_extern(offsets)) {
+    const ulint n_fields = rec_offs_n_fields(offsets);
+
+    for (ulint field_no = 0; field_no < n_fields; field_no++) {
+      if (!rec_offs_nth_extern(offsets, field_no)) {
+        continue;
+      }
+
+      btr_cur_set_ownership_of_extern_field(
+        block, rec, index, offsets, field_no, true, mtr);
+    }
+  }
+}
+
+/** Read the length of the BLOB part stored on a BLOB page.
+@param blob_header in: blob header
+@return part length, read from offset BTR_BLOB_HDR_PART_LEN of the header */
+static
+uint32_t
+btr_blob_get_part_len(
+  const byte* blob_header)
+{
+  const byte* const field = blob_header + BTR_BLOB_HDR_PART_LEN;
+
+  return mach_read_from_4(field);
+}
+
+/** Read the number of the page where the next BLOB part is stored.
+@param blob_header in: blob header
+@return page number or FIL_NULL if no more pages */
+static
+uint32_t
+btr_blob_get_next_page_no(
+  const byte* blob_header)
+{
+  const byte* const field = blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO;
+
+  return mach_read_from_4(field);
+}
+
+/** Commit the mini-transaction and try to evict from the buffer pool a block
+that was reserved for a BLOB part.
+@param block buffer block
+@param all   flag whether to remove a ROW_FORMAT=COMPRESSED page
+@param mtr   mini-transaction to commit */
+static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
+{
+  /* The page is looked up again by its identifier after the commit. */
+  const page_id_t id(block->page.id());
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  mtr->commit();
+
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *const bpage= buf_pool.page_hash.get(id, chain);
+
+  if (bpage && !buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
+  {
+    /* Attempt to deallocate the redundant copy of the uncompressed page
+    if the whole ROW_FORMAT=COMPRESSED block cannot be deallocated. */
+    buf_LRU_free_page(bpage, false);
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Helper class used while writing blob pages, during insert or update.
+It keeps pointers to the caller's block and record variables, so that
+check() can update them after the mini-transaction has been committed and
+restarted. */
+struct btr_blob_log_check_t {
+ /** Persistent cursor on a clustered index record with blobs. */
+ btr_pcur_t* m_pcur;
+ /** Mini transaction holding the latches for m_pcur */
+ mtr_t* m_mtr;
+ /** rec_get_offsets(rec, index); offset of clust_rec */
+ const rec_offs* m_offsets;
+ /** The block containing clustered record */
+ buf_block_t** m_block;
+ /** The clustered record pointer */
+ rec_t** m_rec;
+ /** The blob operation code */
+ enum blob_op m_op;
+
+ /** Constructor
+ @param[in] pcur persistent cursor on a clustered
+ index record with blobs.
+ @param[in] mtr mini-transaction holding latches for
+ pcur.
+ @param[in] offsets offsets of the clust_rec
+ @param[in,out] block record block containing pcur record
+ @param[in,out] rec the clustered record pointer
+ @param[in] op the blob operation code */
+ btr_blob_log_check_t(
+ btr_pcur_t* pcur,
+ mtr_t* mtr,
+ const rec_offs* offsets,
+ buf_block_t** block,
+ rec_t** rec,
+ enum blob_op op)
+ : m_pcur(pcur),
+ m_mtr(mtr),
+ m_offsets(offsets),
+ m_block(block),
+ m_rec(rec),
+ m_op(op)
+ {
+ ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
+ ut_ad((*m_block)->page.frame == page_align(*m_rec));
+ ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
+ }
+
+ /** Check if there is enough space in log file. Commit and re-start the
+ mini transaction.
+
+ For BTR_STORE_INSERT_BULK the record position is remembered as a page
+ number plus an offset within the page, and the block is buffer-fixed
+ across the commit. For other operations the position is stored in and
+ restored from the persistent cursor, under an index SX-latch. After the
+ restart, log_free_check() is called, the record page is latched again,
+ *m_block and *m_rec are reassigned from the cursor, and m_offsets is
+ made valid for the new record pointer. */
+ void check()
+ {
+ dict_index_t* index = m_pcur->index();
+ ulint offs = 0;
+ uint32_t page_no = FIL_NULL; /* FIL_NULL unless BTR_STORE_INSERT_BULK */
+
+ if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
+ offs = page_offset(*m_rec);
+ page_no = (*m_block)->page.id().page_no();
+ (*m_block)->page.fix(); /* paired with the unfix() call below */
+ ut_ad(page_no != FIL_NULL);
+ } else {
+ btr_pcur_store_position(m_pcur, m_mtr);
+ }
+ m_mtr->commit();
+
+ DEBUG_SYNC_C("blob_write_middle");
+
+ const mtr_log_t log_mode = m_mtr->get_log_mode();
+ m_mtr->start();
+ m_mtr->set_log_mode(log_mode);
+ index->set_modified(*m_mtr);
+
+ log_free_check();
+
+ DEBUG_SYNC_C("blob_write_middle_after_check");
+
+ if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
+ dberr_t err;
+ if (UNIV_LIKELY(index->page != page_no)) {
+ ut_a(btr_root_block_get(index, RW_SX_LATCH,
+ m_mtr, &err));
+ }
+ m_pcur->btr_cur.page_cur.block = btr_block_get(
+ *index, page_no, RW_X_LATCH, false, m_mtr);
+ /* The page should not be evicted or corrupted while
+ we are holding a buffer-fix on it. */
+ m_pcur->btr_cur.page_cur.block->page.unfix();
+ m_pcur->btr_cur.page_cur.rec
+ = m_pcur->btr_cur.page_cur.block->page.frame
+ + offs;
+ } else {
+ ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
+ mtr_sx_lock_index(index, m_mtr);
+ ut_a(m_pcur->restore_position(
+ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED,
+ m_mtr) == btr_pcur_t::SAME_ALL);
+ }
+
+ *m_block = btr_pcur_get_block(m_pcur);
+ *m_rec = btr_pcur_get_rec(m_pcur);
+
+ rec_offs_make_valid(*m_rec, index, true,
+ const_cast<rec_offs*>(m_offsets));
+
+ ut_ad(m_mtr->memo_contains_page_flagged(
+ *m_rec,
+ MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+
+ ut_ad((m_op == BTR_STORE_INSERT_BULK)
+ == !m_mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK
+ | MTR_MEMO_X_LOCK));
+ }
+};
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+
+TODO: If the allocation extends the tablespace, it will not be redo logged, in
+any mini-transaction. Tablespace extension should be redo-logged, so that
+recovery will not fail when the big_rec was written to the extended portion of
+the file, in case the file was somehow truncated in the crash.
+
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ btr_pcur_t* pcur, /*!< in: a persistent cursor */
+ rec_offs* offsets, /*!< in/out: rec_get_offsets() on
+ pcur. the "external storage" flags
+ in offsets will correctly correspond
+ to rec when this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in/out: mtr containing the
+ latches to the clustered index. can be
+ committed and restarted. */
+ enum blob_op op) /*! in: operation code */
+{
+ byte* field_ref;
+ ulint extern_len;
+ ulint store_len;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ page_zip_des_t* page_zip;
+ z_stream c_stream;
+ dberr_t error = DB_SUCCESS;
+ dict_index_t* index = pcur->index();
+ buf_block_t* rec_block = btr_pcur_get_block(pcur);
+ rec_t* rec = btr_pcur_get_rec(pcur);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(op == BTR_STORE_INSERT_BULK
+ || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+ ut_a(dict_index_is_clust(index));
+
+ if (!fil_page_index_page_check(page_align(rec))) {
+ if (op != BTR_STORE_INSERT_BULK) {
+ return DB_PAGE_CORRUPTED;
+ }
+ }
+
+ btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
+ &rec, op);
+ page_zip = buf_block_get_page_zip(rec_block);
+
+ if (page_zip) {
+ int err;
+
+ /* Zlib deflate needs 128 kilobytes for the default
+ window size, plus 512 << memLevel, plus a few
+ kilobytes for small objects. We use reduced memLevel
+ to limit the memory consumption, and preallocate the
+ heap, hoping to avoid memory fragmentation. */
+ heap = mem_heap_create(250000);
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, int(page_zip_level),
+ Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must either be zero or they must be pointers to inherited
+ columns, owned by this record or an earlier record version. */
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, big_rec_vec->fields[i].field_no);
+
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ /* Either this must be an update in place,
+ or the BLOB must be inherited, or the BLOB pointer
+ must be zero (will be written in this function). */
+ ut_a(op == BTR_STORE_UPDATE
+ || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ || !memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ /* Space available in compressed page to carry blob data */
+ const ulint payload_size_zip = rec_block->physical_size()
+ - FIL_PAGE_DATA;
+
+ /* Space available in uncompressed page to carry blob data */
+ const ulint payload_size = payload_size_zip
+ - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
+
+ /* We have to create a file segment to the tablespace
+ for each field and put the pointer to the field in rec */
+
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ const ulint field_no = big_rec_vec->fields[i].field_no;
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* A zero BLOB pointer should have been initially inserted. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ extern_len = big_rec_vec->fields[i].len;
+ MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
+ ut_a(extern_len > 0);
+
+ uint32_t prev_page_no = FIL_NULL;
+
+ if (page_zip) {
+ int err = deflateReset(&c_stream);
+ ut_a(err == Z_OK);
+
+ c_stream.next_in = (Bytef*)
+ big_rec_vec->fields[i].data;
+ c_stream.avail_in = static_cast<uInt>(extern_len);
+ }
+
+ for (ulint blob_npages = 0;; ++blob_npages) {
+ buf_block_t* block;
+ const ulint commit_freq = 4;
+ uint32_t r_extents;
+
+ ut_ad(page_align(field_ref) == page_align(rec));
+
+ if (!(blob_npages % commit_freq)) {
+
+ redo_log.check();
+
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, field_no);
+
+ page_zip = buf_block_get_page_zip(rec_block);
+ }
+
+ ut_ad(btr_mtr->get_already_latched(
+ page_id_t{index->table->space_id, index->page},
+ MTR_MEMO_PAGE_SX_FIX));
+
+ mtr.start();
+ index->set_modified(mtr);
+ mtr.set_log_mode_sub(*btr_mtr);
+
+ rec_block->page.fix();
+ rec_block->page.lock.x_lock();
+
+ mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!btr_search_check_marked_free_index(rec_block));
+#endif
+
+ uint32_t hint_prev = prev_page_no;
+ if (hint_prev == FIL_NULL) {
+ hint_prev = rec_block->page.id().page_no();
+ }
+
+ error = fsp_reserve_free_extents(
+ &r_extents, index->table->space, 1,
+ FSP_BLOB, &mtr, 1);
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+alloc_fail:
+ mtr.commit();
+ goto func_exit;
+ }
+
+ block = btr_page_alloc(index, hint_prev + 1,
+ FSP_NO_DIR, 0, &mtr, &mtr,
+ &error);
+
+ index->table->space->release_free_extents(r_extents);
+ if (!block) {
+ goto alloc_fail;
+ }
+
+ const uint32_t space_id = block->page.id().space();
+ const uint32_t page_no = block->page.id().page_no();
+
+ if (prev_page_no == FIL_NULL) {
+ } else if (buf_block_t* prev_block =
+ buf_page_get_gen(page_id_t(space_id,
+ prev_page_no),
+ rec_block->zip_size(),
+ RW_X_LATCH, nullptr,
+ BUF_GET, &mtr, &error)) {
+ if (page_zip) {
+ mtr.write<4>(*prev_block,
+ prev_block->page.frame
+ + FIL_PAGE_NEXT,
+ page_no);
+ memcpy_aligned<4>(
+ buf_block_get_page_zip(
+ prev_block)
+ ->data + FIL_PAGE_NEXT,
+ prev_block->page.frame
+ + FIL_PAGE_NEXT, 4);
+ } else {
+ mtr.write<4>(*prev_block,
+ BTR_BLOB_HDR_NEXT_PAGE_NO
+ + FIL_PAGE_DATA
+ + prev_block->page.frame,
+ page_no);
+ }
+ } else {
+ goto alloc_fail;
+ }
+
+ ut_ad(!page_has_siblings(block->page.frame));
+ ut_ad(!fil_page_get_type(block->page.frame));
+
+ if (page_zip) {
+ int err;
+ page_zip_des_t* blob_page_zip;
+
+ mtr.write<1>(*block,
+ FIL_PAGE_TYPE + 1
+ + block->page.frame,
+ prev_page_no == FIL_NULL
+ ? FIL_PAGE_TYPE_ZBLOB
+ : FIL_PAGE_TYPE_ZBLOB2);
+ block->page.zip.data[FIL_PAGE_TYPE + 1]
+ = block->page.frame[FIL_PAGE_TYPE + 1];
+
+ c_stream.next_out = block->page.frame
+ + FIL_PAGE_DATA;
+ c_stream.avail_out = static_cast<uInt>(
+ payload_size_zip);
+
+ err = deflate(&c_stream, Z_FINISH);
+ ut_a(err == Z_OK || err == Z_STREAM_END);
+ ut_a(err == Z_STREAM_END
+ || c_stream.avail_out == 0);
+
+ mtr.memcpy(*block,
+ FIL_PAGE_DATA,
+ page_zip_get_size(page_zip)
+ - FIL_PAGE_DATA
+ - c_stream.avail_out);
+ /* Copy the page to compressed storage,
+ because it will be flushed to disk
+ from there. */
+ blob_page_zip = buf_block_get_page_zip(block);
+ ut_ad(blob_page_zip);
+ ut_ad(page_zip_get_size(blob_page_zip)
+ == page_zip_get_size(page_zip));
+ memcpy(blob_page_zip->data, block->page.frame,
+ page_zip_get_size(page_zip));
+
+ if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+ goto next_zip_page;
+ }
+
+ if (err == Z_STREAM_END) {
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN, 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN + 4,
+ c_stream.total_in);
+ } else {
+ memset(field_ref + BTR_EXTERN_LEN,
+ 0, 8);
+ }
+
+ if (prev_page_no == FIL_NULL) {
+ ut_ad(blob_npages == 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_NEXT);
+ }
+
+ /* We compress a page when finish bulk insert.*/
+ if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
+ page_zip_write_blob_ptr(
+ rec_block, rec, index, offsets,
+ field_no, &mtr);
+ }
+
+next_zip_page:
+ prev_page_no = page_no;
+
+ /* Commit mtr and release the
+ uncompressed page frame to save memory. */
+ btr_blob_free(block, FALSE, &mtr);
+
+ if (err == Z_STREAM_END) {
+ break;
+ }
+ } else {
+ mtr.write<1>(*block, FIL_PAGE_TYPE + 1
+ + block->page.frame,
+ FIL_PAGE_TYPE_BLOB);
+
+ if (extern_len > payload_size) {
+ store_len = payload_size;
+ } else {
+ store_len = extern_len;
+ }
+
+ mtr.memcpy<mtr_t::MAYBE_NOP>(
+ *block,
+ FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+ + block->page.frame,
+ static_cast<const byte*>
+ (big_rec_vec->fields[i].data)
+ + big_rec_vec->fields[i].len
+ - extern_len, store_len);
+ mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+ + FIL_PAGE_DATA
+ + block->page.frame,
+ store_len);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
+ + FIL_PAGE_DATA, 4, 0xff);
+
+ extern_len -= store_len;
+
+ ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
+ + field_ref));
+ mtr.write<4>(*rec_block,
+ BTR_EXTERN_LEN + 4 + field_ref,
+ big_rec_vec->fields[i].len
+ - extern_len);
+
+ if (prev_page_no == FIL_NULL) {
+ ut_ad(blob_npages == 0);
+ mtr.write<4,mtr_t::MAYBE_NOP>(
+ *rec_block,
+ field_ref + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mtr.write<4>(*rec_block, field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mtr.write<4>(*rec_block, field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_DATA);
+ }
+
+ prev_page_no = page_no;
+
+ mtr.commit();
+
+ if (extern_len == 0) {
+ break;
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("btr_store_big_rec_extern",
+ error = DB_OUT_OF_FILE_SPACE;
+ goto func_exit;);
+
+ rec_offs_make_nth_extern(offsets, field_no);
+ }
+
+func_exit:
+ if (page_zip) {
+ deflateEnd(&c_stream);
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must be valid. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ /* The pointer must not be zero if the operation
+ succeeded. */
+ ut_a(0 != memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)
+ || error != DB_SUCCESS);
+ /* The column must not be disowned by this record. */
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ return(error);
+}
+
+/** Validate the FIL_PAGE_TYPE of an uncompressed BLOB page.
+@param block   uncompressed BLOB page
+@param op      name of the operation; only used in the error message
+@return whether the page type is considered invalid */
+static bool btr_check_blob_fil_page_type(const buf_block_t& block,
+                                         const char *op)
+{
+  const uint16_t type= fil_page_get_type(block.page.frame);
+
+  if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
+    return false;
+
+  fil_space_t *space= fil_space_t::get(block.page.id().space());
+  if (!space)
+    return false;
+
+  /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
+  pages. A mismatch is therefore only reported (and treated as a
+  failure) when the tablespace uses full_crc32 or atomic BLOBs. */
+  const bool fail= space->full_crc32()
+    || DICT_TF_HAS_ATOMIC_BLOBS(space->flags);
+
+  if (fail)
+    sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u",
+                    type, op, space->chain.start->name,
+                    block.page.id().page_no());
+
+  space->release();
+  return fail;
+}
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field,
+in a rollback we may have the additional condition that the field must
+not be inherited.
+The BLOB is released one page per loop iteration, each in its own
+mini-transaction; after each page is freed, field_ref is updated to point
+to the next BLOB page and its length is zeroed. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ buf_block_t* block, /*!< in/out: page of field_ref */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ bool rollback, /*!< in: performing rollback? */
+ mtr_t* local_mtr) /*!< in: mtr
+ containing the latch to data and an
+ X-latch to the index tree */
+{
+ const uint32_t space_id = mach_read_from_4(
+ field_ref + BTR_EXTERN_SPACE_ID);
+
+ ut_ad(index->is_primary());
+ ut_ad(block->page.lock.have_x());
+ ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
+ ut_ad(index->table->space_id == index->table->space->id);
+ ut_ad(local_mtr->is_named_space(index->table->space));
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* In the rollback, we may encounter a clustered index
+ record with some unwritten off-page columns. There is
+ nothing to free then. */
+ ut_a(rollback);
+ return;
+ }
+
+ /* Only the owner and inherited flag bits may be set in the
+ first 4 bytes of the length field. */
+ ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
+ & ~((BTR_EXTERN_OWNER_FLAG
+ | BTR_EXTERN_INHERITED_FLAG) << 24)));
+ ut_ad(space_id == index->table->space_id);
+
+ const ulint ext_zip_size = index->table->space->zip_size();
+ /* !rec holds in a call from purge when field_ref is in an undo page */
+ ut_ad(rec || !block->page.zip.data);
+
+ /* Free one BLOB page per iteration. The loop terminates through
+ skip_free once field_ref holds FIL_NULL as the page number, or a
+ flag forbids freeing, or the BLOB page cannot be fetched. */
+ for (;;) {
+ mtr_t mtr;
+
+ mtr.start();
+ mtr.set_spaces(*local_mtr);
+ mtr.set_log_mode_sub(*local_mtr);
+
+ ut_ad(!index->table->is_temporary()
+ || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
+
+ const uint32_t page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ buf_block_t* ext_block;
+
+ if (/* There is no external storage data */
+ page_no == FIL_NULL
+ /* This field does not own the externally stored field */
+ || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_OWNER_FLAG)
+ /* Rollback and inherited field */
+ || (rollback
+ && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_INHERITED_FLAG))) {
+skip_free:
+ /* Do not free */
+ mtr.commit();
+
+ return;
+ }
+
+ ext_block = buf_page_get(page_id_t(space_id, page_no),
+ ext_zip_size, RW_X_LATCH, &mtr);
+
+ if (!ext_block) {
+ goto skip_free;
+ }
+
+ /* The buffer pool block containing the BLOB pointer is
+ exclusively latched by local_mtr. To satisfy some design
+ constraints, we must recursively latch it in mtr as well. */
+ block->fix();
+ block->page.lock.x_lock();
+
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!btr_search_check_marked_free_index(block));
+#endif
+
+ const page_t* page = buf_block_get_frame(ext_block);
+
+ if (ext_zip_size) {
+ /* Note that page_zip will be NULL
+ in row_purge_upd_exist_or_extern(). */
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ default:
+ MY_ASSERT_UNREACHABLE();
+ }
+ /* On compressed BLOB pages the next-page pointer
+ is read from FIL_PAGE_NEXT. It must be read before
+ the page is freed. */
+ const uint32_t next_page_no = mach_read_from_4(
+ page + FIL_PAGE_NEXT);
+
+ btr_page_free(index, ext_block, &mtr, true,
+ local_mtr->memo_contains(
+ *index->table->space));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no);
+ memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
+ page_zip_write_blob_ptr(block, rec, index,
+ offsets, i, &mtr);
+ } else {
+ mtr.write<4>(*block,
+ BTR_EXTERN_PAGE_NO + field_ref,
+ next_page_no);
+ mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+ BTR_EXTERN_LEN
+ + 4 + field_ref,
+ 0U);
+ }
+ } else {
+ ut_ad(!block->page.zip.data);
+ /* The result of the type check is ignored here;
+ a mismatch is only logged by the callee. */
+ btr_check_blob_fil_page_type(*ext_block, "purge");
+
+ /* On uncompressed BLOB pages the next-page pointer
+ is read from the BLOB header at FIL_PAGE_DATA. */
+ const uint32_t next_page_no = mach_read_from_4(
+ page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO);
+ btr_page_free(index, ext_block, &mtr, true,
+ local_mtr->memo_contains(
+ *index->table->space));
+
+ mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
+ next_page_no);
+ /* Zero out the BLOB length. If the server
+ crashes during the execution of this function,
+ trx_rollback_all_recovered() could
+ dereference the half-deleted BLOB, fetching a
+ wrong prefix for the BLOB. */
+ mtr.write<4,mtr_t::MAYBE_NOP>(*block,
+ BTR_EXTERN_LEN + 4
+ + field_ref, 0U);
+ }
+
+ /* Commit mtr and release the BLOB block to save memory. */
+ btr_blob_free(ext_block, TRUE, &mtr);
+ }
+}
+
+/***********************************************************//**
+Releases every externally stored column of a clustered index leaf
+record by calling btr_free_externally_stored_field() on each of them. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+	dict_index_t*	index,	/*!< in: index of the data, the index
+				tree MUST be X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	buf_block_t*	block,	/*!< in: index page of rec */
+	bool		rollback,/*!< in: performing rollback? */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the index
+				tree */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(index->is_primary());
+	ut_ad(page_rec_is_leaf(rec));
+	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+
+	const ulint	field_count = rec_offs_n_fields(offsets);
+
+	for (ulint field_no = 0; field_no < field_count; field_no++) {
+		if (!rec_offs_nth_extern(offsets, field_no)) {
+			/* Stored entirely in the record: nothing to free */
+			continue;
+		}
+
+		btr_free_externally_stored_field(
+			index,
+			btr_rec_get_field_ref(rec, offsets, field_no),
+			rec, offsets, block, field_no, rollback, mtr);
+	}
+}
+
+/***********************************************************//**
+Releases the externally stored columns of a record, restricted to the
+columns that are listed in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
+				X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	buf_block_t*	block,	/*!< in: index page of rec */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update,	/*!< in: update vector */
+	bool		rollback,/*!< in: performing rollback? */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the tree */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
+
+	const ulint	n_updated = upd_get_n_fields(update);
+
+	for (ulint u = 0; u < n_updated; u++) {
+		const upd_field_t*	upd_field
+			= upd_get_nth_field(update, u);
+
+		if (!rec_offs_nth_extern(offsets, upd_field->field_no)) {
+			continue;
+		}
+
+		ulint	field_len;
+		byte*	field = rec_get_nth_field(
+			rec, offsets, upd_field->field_no, &field_len);
+		ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		/* The BLOB reference occupies the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the stored field. */
+		byte*	field_ref = field + field_len
+			- BTR_EXTERN_FIELD_REF_SIZE;
+
+		btr_free_externally_stored_field(
+			index, field_ref, rec, offsets, block,
+			upd_field->field_no, rollback, mtr);
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB by walking its chain of pages,
+one mini-transaction per page. The clustered index record that points
+to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	uint32_t	len,	/*!< in: length of buf, in bytes */
+	page_id_t	id,	/*!< in: page identifier of the first BLOB page */
+	uint32_t	offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	n_copied = 0;
+
+	for (;;) {
+		mtr_t	mtr;
+
+		mtr_start(&mtr);
+
+		buf_block_t*	block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
+
+		/* Stop with what we have if the page is unavailable
+		or its type is rejected. */
+		if (!block || btr_check_blob_fil_page_type(*block, "read")) {
+			mtr.commit();
+			return n_copied;
+		}
+
+		const byte*	hdr = buf_block_get_frame(block) + offset;
+		const ulint	part_len = btr_blob_get_part_len(hdr);
+		const ulint	n_bytes = ut_min(part_len, len - n_copied);
+
+		memcpy(buf + n_copied, hdr + BTR_BLOB_HDR_SIZE, n_bytes);
+		n_copied += n_bytes;
+
+		id.set_page_no(btr_blob_get_next_page_no(hdr));
+
+		mtr_commit(&mtr);
+
+		/* Done when the chain ends or buf could not take
+		the whole part. */
+		if (id.page_no() == FIL_NULL || n_bytes != part_len) {
+			MEM_CHECK_DEFINED(buf, n_copied);
+			return(n_copied);
+		}
+
+		/* On every BLOB page after the first one, the BLOB
+		header is located at the start of the page data. */
+		offset = FIL_PAGE_DATA;
+
+		ut_ad(n_copied <= len);
+	}
+}
+
+/** Copies the prefix of a compressed BLOB.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out]	buf		the externally stored part of the field,
+or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
+@param[in]	id		page identifier of the first BLOB page
+@param[in]	offset		offset, on the first BLOB page, of the
+4-byte pointer to the next BLOB page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_zblob_prefix(
+	byte*		buf,
+	uint32_t	len,
+	ulint		zip_size,
+	page_id_t	id,
+	uint32_t	offset)
+{
+	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
+	mem_heap_t*	heap;
+	int		err;
+	z_stream	d_stream;
+
+	d_stream.next_out = buf;
+	d_stream.avail_out = static_cast<uInt>(len);
+	d_stream.next_in = Z_NULL;
+	d_stream.avail_in = 0;
+
+	/* Zlib inflate needs 32 kilobytes for the default
+	window size, plus a few kilobytes for small objects. */
+	heap = mem_heap_create(40000);
+	page_zip_set_alloc(&d_stream, heap);
+
+	ut_ad(zip_size);
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(id.space());
+
+	err = inflateInit(&d_stream);
+	ut_a(err == Z_OK);
+
+	for (;;) {
+		buf_page_t*	bpage;
+		uint32_t	next_page_no;
+
+		/* There is no latch on bpage directly. Instead,
+		bpage is protected by the B-tree page latch that
+		is being held on the clustered index record, or,
+		in row_merge_copy_blobs(), by an exclusive table lock. */
+		bpage = buf_page_get_zip(id, zip_size);
+
+		if (UNIV_UNLIKELY(!bpage)) {
+			ib::error() << "Cannot load compressed BLOB " << id;
+			goto func_exit;
+		}
+
+		/* The first page must be FIL_PAGE_TYPE_ZBLOB and every
+		subsequent page FIL_PAGE_TYPE_ZBLOB2 (see the end of
+		the loop body). */
+		if (UNIV_UNLIKELY
+		    (fil_page_get_type(bpage->zip.data) != page_type)) {
+
+			ib::error() << "Unexpected type "
+				<< fil_page_get_type(bpage->zip.data)
+				<< " of compressed BLOB page " << id;
+
+			ut_ad(0);
+			goto end_of_blob;
+		}
+
+		next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+			/* When the BLOB begins at page header,
+			the compressed data payload does not
+			immediately follow the next page pointer. */
+			offset = FIL_PAGE_DATA;
+		} else {
+			offset += 4;
+		}
+
+		d_stream.next_in = bpage->zip.data + offset;
+		d_stream.avail_in = uInt(zip_size - offset);
+
+		err = inflate(&d_stream, Z_NO_FLUSH);
+		switch (err) {
+		case Z_OK:
+			if (!d_stream.avail_out) {
+				goto end_of_blob;
+			}
+			break;
+		case Z_STREAM_END:
+			if (next_page_no == FIL_NULL) {
+				goto end_of_blob;
+			}
+			/* fall through */
+		default:
+inflate_error:
+			/* z_stream::msg is NULL when zlib has no message
+			to report (for example on Z_STREAM_END); a null
+			char pointer must not be streamed. */
+			ib::error() << "inflate() of compressed BLOB page "
+				<< id
+				<< " returned " << err
+				<< " ("
+				<< (d_stream.msg ? d_stream.msg : "no message")
+				<< ")";
+			/* fall through */
+
+		case Z_BUF_ERROR:
+			goto end_of_blob;
+		}
+
+		if (next_page_no == FIL_NULL) {
+			if (!d_stream.avail_in) {
+				ib::error()
+					<< "Unexpected end of compressed "
+					<< "BLOB page " << id;
+			} else {
+				err = inflate(&d_stream, Z_FINISH);
+				switch (err) {
+				case Z_STREAM_END:
+				case Z_BUF_ERROR:
+					break;
+				default:
+					goto inflate_error;
+				}
+			}
+
+end_of_blob:
+			bpage->lock.s_unlock();
+			bpage->unfix();
+			goto func_exit;
+		}
+
+		bpage->lock.s_unlock();
+		bpage->unfix();
+
+		/* On other BLOB pages except the first
+		the BLOB header always is at the page header: */
+
+		id.set_page_no(next_page_no);
+		offset = FIL_PAGE_NEXT;
+		page_type = FIL_PAGE_TYPE_ZBLOB2;
+	}
+
+func_exit:
+	inflateEnd(&d_stream);
+	mem_heap_free(heap);
+	MEM_CHECK_DEFINED(buf, d_stream.total_out);
+	return(d_stream.total_out);
+}
+
+/** Copies the prefix of an externally stored field of a record,
+dispatching to the compressed or uncompressed BLOB reader.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out]	buf		the externally stored part of the
+field, or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	id		page identifier of the first BLOB page
+@param[in]	offset		offset on the first BLOB page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_externally_stored_field_prefix_low(
+	byte*		buf,
+	uint32_t	len,
+	ulint		zip_size,
+	page_id_t	id,
+	uint32_t	offset)
+{
+	if (!len) {
+		/* Nothing was requested */
+		return 0;
+	}
+
+	if (zip_size) {
+		return btr_copy_zblob_prefix(buf, len, zip_size, id, offset);
+	}
+
+	return btr_copy_blob_prefix(buf, len, id, offset);
+}
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	buf		the field, or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	local_len	length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+	byte*		buf,
+	ulint		len,
+	ulint		zip_size,
+	const byte*	data,
+	ulint		local_len)
+{
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* Length of the locally stored prefix, excluding the reference */
+	const ulint	prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+	if (UNIV_UNLIKELY(prefix_len >= len)) {
+		/* The local prefix alone fills the buffer */
+		memcpy(buf, data, len);
+		return(len);
+	}
+
+	memcpy(buf, data, prefix_len);
+
+	const byte*	field_ref = data + prefix_len;
+
+	ut_a(memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+	if (!mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4)) {
+		/* A zero length means that the off-page part of the
+		column has been (partially) deleted. Report the
+		half-deleted BLOB to the caller. */
+		return(0);
+	}
+
+	const uint32_t	space_id = mach_read_from_4(
+		field_ref + BTR_EXTERN_SPACE_ID);
+	const uint32_t	page_no = mach_read_from_4(
+		field_ref + BTR_EXTERN_PAGE_NO);
+	const uint32_t	offset = mach_read_from_4(
+		field_ref + BTR_EXTERN_OFFSET);
+	const ulint	remaining = len - prefix_len;
+
+	const ulint	extern_copied
+		= btr_copy_externally_stored_field_prefix_low(
+			buf + prefix_len, uint32_t(remaining), zip_size,
+			page_id_t(space_id, page_no), offset);
+
+	return(prefix_len + extern_copied);
+}
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	len		length of the whole field
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	local_len	length of data
+@param[in,out]	heap		mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+	ulint*		len,
+	const byte*	data,
+	ulint		zip_size,
+	ulint		local_len,
+	mem_heap_t*	heap)
+{
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+	/* The BLOB reference follows the locally stored prefix */
+	const byte*	field_ref = data + local_len;
+
+	const uint32_t	space_id = mach_read_from_4(
+		field_ref + BTR_EXTERN_SPACE_ID);
+	const uint32_t	page_no = mach_read_from_4(
+		field_ref + BTR_EXTERN_PAGE_NO);
+	const uint32_t	offset = mach_read_from_4(
+		field_ref + BTR_EXTERN_OFFSET);
+
+	/* Currently a BLOB cannot be bigger than 4 GB; the 4 upper
+	bytes of the length field are left unused */
+	const uint32_t	extern_len = mach_read_from_4(
+		field_ref + BTR_EXTERN_LEN + 4);
+
+	byte*	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
+
+	memcpy(buf, data, local_len);
+
+	const ulint	extern_copied
+		= btr_copy_externally_stored_field_prefix_low(
+			buf + local_len, extern_len, zip_size,
+			page_id_t(space_id, page_no), offset);
+
+	*len = local_len + extern_copied;
+
+	return(buf);
+}
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in]	rec		record in a clustered index; must be
+protected by a lock or a page latch
+@param[in]	offsets		array returned by rec_get_offsets()
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	no		field number
+@param[out]	len		length of the field
+@param[in,out]	heap		mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	ulint		zip_size,
+	ulint		no,
+	ulint*		len,
+	mem_heap_t*	heap)
+{
+	ut_a(rec_offs_nth_extern(offsets, no));
+
+	/* An externally stored field can contain some initial
+	data from the field, and in the last 20 bytes it has the
+	space id, page number, and offset where the rest of the
+	field data is stored, and the data length in addition to
+	the data stored locally. We may need to store some data
+	locally to get the local record length above the 128 byte
+	limit so that field offsets are stored in two bytes, and
+	the extern bit is available in those two bytes. */
+
+	ulint		local_len;
+	const byte*	data = rec_get_nth_field(rec, offsets, no, &local_len);
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	const byte*	field_ref = data + local_len
+		- BTR_EXTERN_FIELD_REF_SIZE;
+
+	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+				  BTR_EXTERN_FIELD_REF_SIZE))) {
+		/* The externally stored field was not written yet.
+		This record should only be seen by
+		trx_rollback_recovered() or any
+		TRX_ISO_READ_UNCOMMITTED transactions. */
+		return(NULL);
+	}
+
+	return(btr_copy_externally_stored_field(len, data,
+						zip_size, local_len, heap));
+}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..642db0e9
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,820 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "mysqld.h"
+
+#include <list>
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when a compression failure
+happens during defragmentation. 512 is chosen because it is a power of 2 and
+it is about 3% of the page size. When there are compression failures in
+defragmentation, our goal is to get a decent defrag ratio with as few
+compression failures as possible. From experimentation it seems that reducing
+the target size by 512 every time makes the page compressible within a couple
+of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/** Item in the defragmentation work queue (btr_defragment_wq). */
+struct btr_defragment_item_t
+{
+ /** persistent cursor where btr_defragment_n_pages should start */
+ btr_pcur_t * const pcur;
+ /** completion signal; reset to nullptr when the waiter is interrupted
+ in btr_defragment_add_index() or by btr_defragment_remove_table() */
+ pthread_cond_t *cond;
+ /** timestamp of last time this index is processed by defragment thread */
+ ulonglong last_processed= 0;
+
+ /** Create a work item.
+ @param pcur persistent cursor to start from
+ @param cond condition variable to signal on completion */
+ btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
+ : pcur(pcur), cond(cond) {}
+};
+
+/* Work queue for defragmentation. It stores pointers to items that are
+not owned by the queue (btr_defragment_add_index() enqueues the address
+of a local variable). */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+static mysql_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+Atomic_counter<ulint> btr_defragment_compression_failures;
+/* Number of btr_defragment_n_pages calls that altered page but didn't
+manage to release any page. */
+Atomic_counter<ulint> btr_defragment_failures;
+/* Total number of btr_defragment_n_pages calls that altered page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+Atomic_counter<ulint> btr_defragment_count;
+
+/* Set by btr_defragment_init() and cleared by btr_defragment_shutdown(). */
+bool btr_defragment_active;
+static void btr_defragment_chunk(void*);
+
+/* Timer created in btr_defragment_init(); its callback is
+submit_defragment_task(). */
+static tpool::timer* btr_defragment_timer;
+/* Task group of btr_defragment_task.
+NOTE(review): the argument 1 presumably limits the group to one concurrent
+task - confirm against tpool::task_group. */
+static tpool::task_group task_group(1);
+/* Task that runs btr_defragment_chunk(). */
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
+/** Submit btr_defragment_task to the server thread pool.
+Registered as the callback of btr_defragment_timer.
+@param arg  unused */
+static void submit_defragment_task(void *arg= nullptr)
+{
+  srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
+/******************************************************************//**
+Initialize defragmentation: compute the interval from the configured
+frequency, create the queue mutex and the timer, and mark the subsystem
+active. */
+void
+btr_defragment_init()
+{
+ /* NOTE(review): this divides by srv_defragment_frequency; presumably
+ the variable is constrained to be non-zero - confirm at its definition. */
+ srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+ mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
+ nullptr);
+ btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+ btr_defragment_active = true;
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources.
+Does nothing if the timer was never created or was already destroyed. */
+void
+btr_defragment_shutdown()
+{
+ if (!btr_defragment_timer)
+ return;
+ delete btr_defragment_timer;
+ btr_defragment_timer = 0;
+ task_group.cancel_pending(&btr_defragment_task);
+ mysql_mutex_lock(&btr_defragment_mutex);
+ /* Empty the work queue, signalling every item that still has a
+ condition variable attached. The items themselves are not freed here. */
+ std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ if (item->cond) {
+ pthread_cond_signal(item->cond);
+ }
+ }
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ mysql_mutex_destroy(&btr_defragment_mutex);
+ btr_defragment_active = false;
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is queued in btr_defragment_wq. Indexes
+are matched by index->id. The queue is scanned while holding
+btr_defragment_mutex.
+@return whether an item for the index was found */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index)	/*!< Index to find. */
+{
+	bool	found = false;
+
+	mysql_mutex_lock(&btr_defragment_mutex);
+
+	for (btr_defragment_item_t* item : btr_defragment_wq) {
+		btr_cur_t*	cursor = btr_pcur_get_btr_cur(item->pcur);
+
+		if (btr_cur_get_index(cursor)->id == index->id) {
+			found = true;
+			break;
+		}
+	}
+
+	mysql_mutex_unlock(&btr_defragment_mutex);
+
+	return found;
+}
+
+/** Defragment an index.
+Enqueues a work item for the index and blocks until the item's condition
+variable is signalled or the session is killed.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
+{
+ dict_stats_empty_defrag_summary(pcur->index());
+ pthread_cond_t cond;
+ pthread_cond_init(&cond, nullptr);
+ /* The item lives on this thread's stack; only its address is queued. */
+ btr_defragment_item_t item(pcur, &cond);
+ mysql_mutex_lock(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(&item);
+ if (btr_defragment_wq.size() == 1)
+ /* Kick off defragmentation work */
+ btr_defragment_start();
+ bool interrupted= false;
+ /* Wait in one-second slices so that a killed session is noticed.
+ NOTE(review): any zero return of my_cond_timedwait() is treated as
+ completion; confirm that a wakeup without a signal cannot occur here. */
+ for (;;)
+ {
+ timespec abstime;
+ set_timespec(abstime, 1);
+ if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
+ break;
+ if (thd_killed(thd))
+ {
+ /* NOTE(review): the item is not removed from btr_defragment_wq here,
+ although it goes out of scope on return. Presumably the worker discards
+ items whose cond is nullptr - confirm that it cannot access the item
+ after this function has returned. */
+ item.cond= nullptr;
+ interrupted= true;
+ break;
+ }
+ }
+
+ pthread_cond_destroy(&cond);
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ return interrupted;
+}
+
+/******************************************************************//**
+When a table is dropped, this function is called to release the waiters
+of its items in btr_defragment_wq: for every queued item that belongs
+to the table and still has a condition variable attached, the condition
+is signalled and the item's cond pointer is reset to nullptr. The items
+are left in the queue. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table)	/*!< Table whose queued items are released. */
+{
+  mysql_mutex_lock(&btr_defragment_mutex);
+
+  for (btr_defragment_item_t *item : btr_defragment_wq)
+  {
+    if (!item->cond || table != item->pcur->index()->table)
+      continue;
+
+    pthread_cond_signal(item->cond);
+    item->cond= nullptr;
+  }
+
+  mysql_mutex_unlock(&btr_defragment_mutex);
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+The index is queued for saving, and its modification counter is reset, once
+the counter has reached srv_defragment_stats_accuracy. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
+{
+	if (srv_defragment_stats_accuracy == 0) {
+		/* stats tracking disabled */
+		return;
+	}
+
+	if (index->table->space_id == 0) {
+		/* do not track system tables */
+		return;
+	}
+
+	if (index->table->is_temporary()) {
+		return;
+	}
+
+	if (index->stat_defrag_modified_counter
+	    < srv_defragment_stats_accuracy) {
+		/* not enough modifications yet */
+		return;
+	}
+
+	dict_stats_defrag_pool_add(index);
+	index->stat_defrag_modified_counter = 0;
+}
+
+/*********************************************************************//**
+Main defragment functionalities used by defragment thread.*/
+/*************************************************************//**
+Count how many records, taken in order from the beginning of the block,
+fit into size_limit.
+@return number of records */
+static
+ulint
+btr_defragment_calc_n_recs_for_size(
+	buf_block_t* block,	/*!< in: B-tree page */
+	dict_index_t* index,	/*!< in: index of the page */
+	ulint size_limit,	/*!< in: size limit to fit records in */
+	ulint* n_recs_size)	/*!< out: actual size of the records that fit
+				in size_limit. */
+{
+	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs*	offsets = offsets_;
+	rec_offs_init(offsets_);
+
+	mem_heap_t*	heap = NULL;
+	page_t*		page = buf_block_get_frame(block);
+	const ulint	n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+	ulint		n_fit = 0;
+	ulint		total_size = 0;
+	page_cur_t	cur;
+
+	page_cur_set_before_first(block, &cur);
+
+	for (;;) {
+		rec_t*	cur_rec = page_cur_move_to_next(&cur);
+
+		if (!cur_rec || page_rec_is_supremum(cur_rec)) {
+			break;
+		}
+
+		offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
+					  ULINT_UNDEFINED, &heap);
+
+		const ulint	rec_size = rec_offs_size(offsets);
+
+		/* Stop before the first record that would exceed
+		the limit; it is not counted. */
+		if (total_size + rec_size > size_limit) {
+			break;
+		}
+
+		total_size += rec_size;
+		n_fit++;
+	}
+
+	*n_recs_size = total_size;
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return n_fit;
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Moves a cursor from a user record on a page to the node pointer record
+that refers to that page one level higher in the tree. Debug builds assert
+that mtr holds an X- or SX-latch on the index tree, that the index is not
+spatial, and that the page is not the index root page.
+@return rec_get_offsets() of the node pointer record
+@retval nullptr if the search failed, or if the node pointer that was
+found does not refer to the page the cursor was originally on */
+static
+rec_offs*
+btr_page_search_father_node_ptr(
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+	dict_index_t* const	index = btr_cur_get_index(cursor);
+	const uint32_t		child_page_no
+		= btr_cur_get_block(cursor)->page.id().page_no();
+
+	ut_ad(!index->is_spatial());
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(dict_index_get_page(index) != child_page_no);
+
+	const rec_t*	user_rec = btr_cur_get_rec(cursor);
+	ut_a(page_rec_is_user_rec(user_rec));
+
+	const auto	level = btr_page_get_level(btr_cur_get_page(cursor));
+
+	/* Build a search tuple from the user record and look it up
+	one level above the page the cursor is on. */
+	const auto	search_tuple = dict_index_build_node_ptr(
+		index, user_rec, 0, heap, level);
+
+	if (btr_cur_search_to_nth_level(level + 1, search_tuple,
+					RW_X_LATCH, cursor, mtr)
+	    != DB_SUCCESS) {
+		return nullptr;
+	}
+
+	const rec_t*	node_ptr = btr_cur_get_rec(cursor);
+	ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
+	      || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+	offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+				  ULINT_UNDEFINED, &heap);
+
+	/* Report success only if the node pointer refers to our page. */
+	return btr_node_ptr_get_child_page_no(node_ptr, offsets)
+		== child_page_no
+		? offsets
+		: nullptr;
+}
+
+/** Position a cursor on the node pointer record that refers to the page
+of the cursor. The cursor is first placed on the record that follows the
+page infimum; btr_page_search_father_node_ptr() then performs the search.
+@param mtr     mini-transaction
+@param cursor  in: cursor whose block is the child page;
+               out: cursor as left by btr_page_search_father_node_ptr()
+@return whether the node pointer record was found
+@retval false if there is no record after the infimum or the search failed */
+static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
+{
+  rec_t *first_rec=
+    page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+  if (UNIV_UNLIKELY(first_rec == nullptr))
+    return false;
+
+  cursor->page_cur.rec= first_rec;
+
+  /* The heap only serves the search; the returned offsets are merely
+  tested against nullptr before the heap is freed. */
+  mem_heap_t *heap= mem_heap_create(100);
+  const bool found=
+    btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr) != nullptr;
+  mem_heap_free(heap);
+  return found;
+}
+
+/*************************************************************//**
+Merge as many records as possible from the from_block to the to_block.
+Delete the from_block if all records are successfully merged to to_block.
+If only part of the records were moved, the node pointer of from_block is
+deleted from the parent and a new one, built from the new first record of
+from_block, is inserted on the level above.
+@return the to_block to target for next merge operation.
+@retval to_block if from_block was merged completely and freed
+@retval from_block if records remain on from_block, or if reorganizing
+to_block failed with DB_FAIL
+@retval nullptr if corruption was noticed or one of the tree operations
+did not return DB_SUCCESS */
+static
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint level = btr_page_get_level(from_page);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0; /* clamp at 0 instead of wrapping around */
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
+ ulint page_diff = srv_page_size - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ dberr_t err = btr_page_reorganize_block(page_zip_level,
+ to_block, index, mtr);
+ if (err != DB_SUCCESS) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+ // If reorganization fails, that means page is
+ // not compressable. There's no point to try
+ // merging into this page. Continue to the
+ // next page.
+ return err == DB_FAIL ? from_block : nullptr; // DB_FAIL: move on to from_block; any other error aborts
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ if (max_ins_size < move_size) {
+ return nullptr; // still not enough room after a successful reorganize
+ }
+ }
+
+ // Move records to pack to_page more full.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ dberr_t err;
+ while (n_recs_to_move > 0) {
+ if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) { // NOTE(review): presumably the first record that stays on from_page - confirm
+ return nullptr;
+ }
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr, &err);
+ if (orig_pred)
+ break;
+ if (err != DB_FAIL) {
+ return nullptr;
+ }
+
+ // If we reach here, that means compression failed after packing
+ // n_recs_to_move number of records to to_page. We try to reduce
+ // the targeted data size on the to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ btr_defragment_compression_failures++;
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+ // If less than target_n_recs are moved, it means there are
+ // compression failures during page_copy_rec_list_start. Adjust
+ // the max_data_size estimation to reduce compression failures
+ // in the following runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ srv_page_size,
+ ULINT_UNDEFINED);
+ }
+ }
+ btr_cur_t parent; // will be positioned on the node pointer of from_block
+ parent.page_cur.index = index;
+ parent.page_cur.block = from_block;
+
+ if (!btr_page_search_father(mtr, &parent)) {
+ to_block = nullptr; // node pointer not found: report failure to the caller
+ } else if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(*to_block, orig_pred,
+ from_block->page.id());
+ btr_search_drop_page_hash_index(from_block, false);
+ if (btr_level_list_remove(*from_block, *index, mtr)
+ != DB_SUCCESS
+ || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS
+ || btr_page_free(index, from_block, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+ } else {
+ // There are still records left on the page, so it becomes
+ // the next merge target (the caller counts it in n_defragmented).
+ // Node pointer will be changed so remove the old node pointer.
+ if (n_recs_to_move > 0) {
+ // Part of the page is merged to left, remove
+ // the merged records, update record locks and
+ // node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ // FIXME: reuse the node_ptr!
+ if (btr_cur_node_ptr_delete(&parent, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page)); // new first record of from_page
+ if (!rec) {
+ return nullptr;
+ }
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level);
+ if (btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
+ }
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge N consecutive pages, starting from the page pointed by the
+cursor. Skip space 0. Only consider leaf pages.
+This function first loads all N pages into memory, then for each of
+the pages other than the first page, it tries to move as many records
+as possible to the left sibling to keep the left sibling full. During
+the process, if any page becomes empty, that page will be removed from
+the level list. Record locks, hash, and node pointers are updated after
+page reorganization.
+n_pages is capped at BTR_DEFRAGMENT_MAX_N_PAGES. If the starting page has
+neither a next nor a previous page and is not the root page, its records
+are lifted up to the father page with btr_lift_page_up().
+@return pointer to the last block processed, or NULL if reaching end of index
+@retval NULL also if the starting page is not a leaf page, if the table is
+in space 0 or has no tablespace object, if a sibling page could not be
+fetched, or if btr_defragment_merge_pages() reported a failure */
+static
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* We need to load block n+1 as well, because if the last page is
+ freed we need to modify the prev_page_no of that block. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ if (!page_is_leaf(block->page.frame)) {
+ return NULL;
+ }
+
+ if (!index->table->space || !index->table->space_id) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ first_page = buf_block_get_frame(block);
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ uint32_t page_no = btr_page_get_next(page);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i; // only i pages exist from the starting page to the end of the level
+ end_of_index = TRUE;
+ break;
+ }
+
+ blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+ mtr);
+ if (!blocks[i]) {
+ return nullptr;
+ }
+ }
+
+ if (n_pages == 1) {
+ if (!page_has_prev(first_page)) {
+ /* no next and no previous page: the only page on this level */
+ if (dict_index_get_page(index)
+ == page_get_page_no(first_page))
+ return NULL;
+ /* The page is not the root page.
+ Lift the records to father. */
+ dberr_t err;
+ btr_lift_page_up(index, block, mtr, &err); // the resulting err is not examined here
+ }
+ return NULL;
+ }
+
+ /* 2. Calculate how many pages data can fit in. If the data cannot be
+ packed into fewer pages, return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+ // For uncompressed pages, the optimal data size is the free space of
+ // an empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ uint i = 0;
+ // We estimate the optimal data size of the index use samples of
+ // data size. These samples are taken when pages failed to
+ // compress due to insertion on the page. We use the average
+ // of all samples we have as the estimation. Different pages of
+ // the same index vary in compressibility. Average gives a good
+ // enough estimation.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size /= i;
+ optimal_page_size = ut_min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
+ reserved_space = ut_min(static_cast<ulint>(
+ static_cast<double>(optimal_page_size)
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
+ n_new_slots = uint((total_data_size + optimal_page_size - 1)
+ / optimal_page_size); // rounded-up division: pages needed for all the data
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) { // the merge target changed (or the merge failed)
+ n_defragmented ++;
+ current_block = new_block;
+ if (!new_block) {
+ break;
+ }
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++; // account for the first merge target, blocks[0]
+ btr_defragment_count++;
+ if (n_pages == n_defragmented) {
+ btr_defragment_failures++;
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
+
+
+
+/** Submit the defragmentation task, unless defragmentation is disabled
+(srv_defragment is not set). Debug builds assert that the work queue
+btr_defragment_wq is not empty at this point. */
+void btr_defragment_start()
+{
+	if (srv_defragment) {
+		ut_ad(!btr_defragment_wq.empty());
+		submit_defragment_task();
+	}
+}
+
+
+/**
+Callback used by defragment timer
+
+Throttling "sleep", is implemented via rescheduling the
+threadpool timer, which, when fired, will resume the work again,
+where it is left.
+
+The unnamed parameter is not used. On every invocation the current work
+item is taken from the head of btr_defragment_wq; the position inside the
+index is kept in the persistent cursor (pcur) of the item.
+
+Work items whose cond has been cleared are removed from the queue without
+being processed. The function returns when the queue is empty, when
+srv_shutdown_state is no longer SRV_SHUTDOWN_NONE, or after the timer has
+been re-armed for throttling.
+*/
+static void btr_defragment_chunk(void*)
+{
+ THD *thd = innobase_create_background_thd("InnoDB defragment");
+ set_current_thd(thd);
+
+ btr_defragment_item_t* item = nullptr;
+ mtr_t mtr;
+
+ mysql_mutex_lock(&btr_defragment_mutex);
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ if (!item) {
+ if (btr_defragment_wq.empty()) {
+release_and_exit:
+ mysql_mutex_unlock(&btr_defragment_mutex);
+func_exit:
+ set_current_thd(nullptr);
+ destroy_background_thd(thd);
+ return;
+ }
+ item = *btr_defragment_wq.begin();
+ ut_ad(item);
+ }
+
+ if (!item->cond) { // cond was cleared (see btr_defragment_remove_table()): drop the item
+processed:
+ btr_defragment_wq.remove(item);
+ item = nullptr;
+ continue;
+ }
+
+ mysql_mutex_unlock(&btr_defragment_mutex);
+
+ ulonglong now = my_interval_timer();
+ ulonglong elapsed = now - item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+ /* If we see an index again before the interval
+ determined by the configured frequency is reached,
+ we do not block here: the timer is re-armed for the
+ remaining time and this callback returns. Since
+ defragmentation of all indices queue up on a single
+ thread, it's likely other indices that follow this one
+ don't need to sleep again. */
+ int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+ if (sleep_ms) {
+ btr_defragment_timer->set_time(sleep_ms, 0);
+ goto func_exit; // btr_defragment_mutex was already released above
+ }
+ }
+ log_free_check();
+ mtr_start(&mtr);
+ dict_index_t *index = item->pcur->index();
+ index->set_modified(mtr);
+ /* To follow the latching order defined in WL#6326,
+ acquire index->lock X-latch. This entitles us to
+ acquire page latches in any order for the index. */
+ mtr_x_lock_index(index, &mtr);
+ if (buf_block_t *last_block =
+ item->pcur->restore_position(
+ BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
+ == btr_pcur_t::CORRUPTED
+ ? nullptr
+ : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
+ index, srv_defragment_n_pages,
+ &mtr)) {
+ /* If we haven't reached the end of the index,
+ place the cursor on the last record of last page,
+ store the cursor position, and put back in queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ if (rec && page_rec_is_user_rec(rec)) {
+ page_cur_position(rec, last_block,
+ btr_pcur_get_page_cur(
+ item->pcur));
+ }
+ btr_pcur_store_position(item->pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ item->last_processed = now;
+ mysql_mutex_lock(&btr_defragment_mutex); // re-acquire before the loop condition is tested again
+ } else {
+ mtr_commit(&mtr);
+ /* Reaching the end of the index, or the cursor could
+ not be restored, or btr_defragment_n_pages() returned
+ NULL for another reason. */
+ dict_stats_empty_defrag_stats(index);
+ if (dberr_t err= dict_stats_save_defrag_stats(index)) {
+ ib::error() << "Saving defragmentation stats for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ } else {
+ err = dict_stats_save_defrag_summary(index,
+ thd);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "Saving defragmentation summary for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ }
+ }
+
+ mysql_mutex_lock(&btr_defragment_mutex);
+ if (item->cond) {
+ pthread_cond_signal(item->cond); // wake up a waiter on this item
+ }
+ goto processed; // jumps back into the loop with the mutex held
+ }
+ }
+
+ goto release_and_exit;
+}
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
new file mode 100644
index 00000000..54dd15ac
--- /dev/null
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -0,0 +1,667 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.cc
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+
+/**************************************************************//**
+Returns a persistent cursor to its initial state: the stored record
+buffer ::old_rec_buf is freed, the stored record information and the page
+cursor are cleared, and the cursor is marked as not positioned and as
+holding no latches. */
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in, out: persistent cursor */
+{
+	/* Release the stored record and forget everything about it. */
+	ut_free(cursor->old_rec_buf);
+	cursor->old_rec_buf = nullptr;
+	cursor->old_rec = nullptr;
+	cursor->old_n_fields = 0;
+	cursor->old_n_core_fields = 0;
+
+	/* Wipe the page cursor. */
+	memset(&cursor->btr_cur.page_cur, 0, sizeof(page_cur_t));
+
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+	cursor->latch_mode = BTR_NO_LATCHES;
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty!
+Besides the record prefix (old_rec, old_n_fields, old_n_core_fields), the
+function records rel_pos, the block (block_when_stored) and its modify
+clock. */
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+ dict_index_t* index;
+ ulint offs;
+
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ page_cursor = btr_pcur_get_page_cur(cursor);
+
+ rec = page_cur_get_rec(page_cursor);
+ offs = rec - block->page.frame; // byte offset of the record within the page frame
+ ut_ad(block->page.id().page_no()
+ == page_get_page_no(block->page.frame));
+ ut_ad(block->page.buf_fix_count());
+ /* For spatial index, when we do positioning on parent
+ buffer if necessary, it might not hold latches, but the
+ tree must be locked to prevent change on the page */
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX)
+ || (index->is_spatial()
+ && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK)));
+
+ if (page_is_empty(block->page.frame)) {
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
+
+ ut_a(!page_has_siblings(block->page.frame));
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(block->page.id().page_no() == index->page);
+
+ if (page_rec_is_supremum_low(offs)) {
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ } else {
+before_first:
+ cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+ }
+
+ return;
+ }
+
+ if (page_rec_is_supremum_low(offs)) {
+ rec = page_rec_get_prev(rec); // remember the record that precedes the supremum
+ if (UNIV_UNLIKELY(!rec || page_rec_is_infimum(rec))) {
+ ut_ad("corrupted index" == 0);
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ return;
+ }
+
+ ut_ad(!page_rec_is_infimum(rec));
+ if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
+#if 0 /* MDEV-22867 had to relax this */
+ /* If the table is emptied during an ALGORITHM=NOCOPY
+ DROP COLUMN ... that is not ALGORITHM=INSTANT,
+ then we must preserve any instant ADD metadata. */
+ ut_ad(index->table->instant
+ || block->page.id().page_no() != index->page);
+#endif
+ ut_ad(index->is_instant()
+ || block->page.id().page_no() != index->page);
+ ut_ad(page_get_n_recs(block->page.frame) == 1);
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(!page_has_prev(block->page.frame));
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ return;
+ }
+
+ cursor->rel_pos = BTR_PCUR_AFTER;
+ } else if (page_rec_is_infimum_low(offs)) {
+ rec = page_rec_get_next(rec); // remember the record that follows the infimum
+
+ if (UNIV_UNLIKELY(!rec)) {
+ ut_ad("corrupted page" == 0);
+ goto before_first;
+ }
+
+ if (rec_is_metadata(rec, *index)) {
+ ut_ad(!page_has_prev(block->page.frame));
+ rec = page_rec_get_next(rec); // skip the metadata record
+ ut_ad(rec);
+ if (!rec || page_rec_is_supremum(rec)) {
+ goto before_first;
+ }
+ }
+
+ cursor->rel_pos = BTR_PCUR_BEFORE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_ON;
+ }
+
+ if (index->is_ibuf()) {
+ ut_ad(!index->table->not_redundant());
+ cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec));
+ } else {
+ cursor->old_n_fields = static_cast<uint16>(
+ dict_index_get_n_unique_in_tree(index));
+ if (index->is_spatial() && !page_rec_is_leaf(rec)) {
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
+ == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ /* For R-tree, we have to compare
+ the child page numbers as well. */
+ cursor->old_n_fields
+ = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+ }
+ }
+
+ cursor->old_n_core_fields = index->n_core_fields;
+ cursor->old_rec = rec_copy_prefix_to_buf(rec, index,
+ cursor->old_n_fields,
+ &cursor->old_rec_buf,
+ &cursor->buf_size);
+ cursor->block_when_stored.store(block);
+
+ /* The callee tries to check that the block is S- or X-latched. */
+ cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. The receiving cursor
+becomes a byte-wise copy of the donor, except that it gets its own duplicate
+of the stored record buffer (old_rec_buf), with old_rec pointing to the same
+offset inside the duplicate. A buffer previously owned by the receiving
+cursor is freed first. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is
+ copied */
+{
+	ut_free(pcur_receive->old_rec_buf);
+	memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+	if (pcur_donate->old_rec_buf != nullptr) {
+		const auto	len = pcur_donate->buf_size;
+		/* Offset of the stored record inside the donor's buffer;
+		the same offset applies to the duplicate. */
+		const auto	rec_offset
+			= pcur_donate->old_rec - pcur_donate->old_rec_buf;
+		byte*		duplicate
+			= static_cast<byte*>(ut_malloc_nokey(len));
+
+		memcpy(duplicate, pcur_donate->old_rec_buf, len);
+		pcur_receive->old_rec_buf = duplicate;
+		pcur_receive->old_rec = duplicate + rec_offset;
+	}
+
+	pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields;
+	pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/** Optimistically latches the leaf page or pages requested.
+For BTR_SEARCH_LEAF and BTR_MODIFY_LEAF only the guessed block is latched
+via buf_page_optimistic_get(). For BTR_SEARCH_PREV and BTR_MODIFY_PREV the
+left sibling (if there is one) is latched first and then the guessed block;
+the attempt fails if the modify clock of the block differs from
+pcur->modify_clock or if the sibling links of the two pages do not match.
+On failure the page latches acquired here are released again.
+@param[in] block guessed buffer block
+@param[in,out] pcur cursor
+@param[in,out] latch_mode BTR_SEARCH_LEAF, ...; after a successful
+BTR_SEARCH_PREV or BTR_MODIFY_PREV attempt it is replaced by the plain
+latch type extracted from it
+@param[in,out] mtr mini-transaction
+@return true if success */
+TRANSACTIONAL_TARGET
+static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
+ btr_pcur_t *pcur,
+ btr_latch_mode *latch_mode,
+ mtr_t *mtr)
+{
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.frame);
+
+ static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+ static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+ static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) ==
+ (RW_S_LATCH ^ RW_X_LATCH), "");
+
+ const rw_lock_type_t mode=
+ rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH)); // keep only the S/X latch bits
+
+ switch (*latch_mode) {
+ default:
+ ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF);
+ return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr);
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ page_id_t id{0};
+ uint32_t left_page_no;
+ ulint zip_size;
+ buf_block_t *left_block= nullptr;
+ {
+ transactional_shared_lock_guard<block_lock> g{block->page.lock};
+ if (block->modify_clock != pcur->modify_clock)
+ return false; // the modify clock differs from the stored one
+ id= block->page.id();
+ zip_size= block->zip_size();
+ left_page_no= btr_page_get_prev(block->page.frame);
+ }
+
+ if (left_page_no != FIL_NULL)
+ {
+ left_block=
+ buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
+ mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+ if (left_block &&
+ btr_page_get_next(left_block->page.frame) != id.page_no()) // the left page must link back to our page
+ {
+release_left_block:
+ mtr->release_last_page();
+ return false;
+ }
+ }
+
+ if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
+ {
+ if (btr_page_get_prev(block->page.frame) == left_page_no)
+ {
+ /* block was already buffer-fixed while entering the function and
+ buf_page_optimistic_get() buffer-fixes it again. */
+ ut_ad(2 <= block->page.buf_fix_count());
+ *latch_mode= btr_latch_mode(mode);
+ return true;
+ }
+
+ mtr->release_last_page(); // the left sibling changed meanwhile: give the block up again
+ }
+
+ ut_ad(block->page.buf_fix_count());
+ if (left_block)
+ goto release_left_block;
+ return false;
+ }
+}
+
+/** Functor that attempts the optimistic latching of leaf pages for a
+persistent cursor. It is invoked with a guessed block and forwards to
+btr_pcur_optimistic_latch_leaves(). It returns true if the latching of
+the leaf pages succeeded, and false otherwise (including when no guessed
+block is available). */
+struct optimistic_latch_leaves
+{
+  /** the persistent cursor being restored */
+  btr_pcur_t *const cursor;
+  /** in/out: requested latch mode, passed on to the callee */
+  btr_latch_mode *const latch_mode;
+  /** mini-transaction that will own the latches */
+  mtr_t *const mtr;
+
+  /** @param hint guessed block, or nullptr
+  @return whether the leaf page(s) were latched */
+  bool operator()(buf_block_t *hint) const
+  {
+    if (!hint)
+      return false;
+    return btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr);
+  }
+};
+
+/** Restores the stored position of a persistent cursor bufferfixing
+the page and obtaining the specified latches. If the cursor position
+was saved when the
+(1) cursor was positioned on a user record: this function restores the
+position to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the
+position to the last record LESS than the user record which was the
+successor of the page infimum;
+(3) cursor was positioned on the page supremum: restores to the first
+record GREATER than the user record which was the predecessor of the
+supremum.
+(4) cursor was positioned before the first or after the last in an
+empty tree: restores to before first or after the last in the tree.
+An optimistic restoration based on the remembered block and modify clock
+is attempted first; if it does not succeed, the tree is searched anew for
+the stored record prefix.
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param mtr mini-transaction
+@return btr_pcur_t::SAME_ALL cursor position on user rec and points on
+the record with the same field values as in the stored record,
+btr_pcur_t::SAME_UNIQ cursor position is on user rec and points on the
+record with the same unique field values as in the stored record,
+btr_pcur_t::NOT_SAME cursor position is not on user rec or points on
+the record with not the same unique field values as in the stored record,
+btr_pcur_t::CORRUPTED if opening the cursor on the tree did not return
+DB_SUCCESS */
+btr_pcur_t::restore_status
+btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
+{
+ dict_index_t* index;
+ dtuple_t* tuple;
+ page_cur_mode_t mode;
+ page_cur_mode_t old_mode;
+ mem_heap_t* heap;
+
+ ut_ad(mtr->is_active());
+ ut_ad(pos_state == BTR_PCUR_WAS_POSITIONED
+ || pos_state == BTR_PCUR_IS_POSITIONED);
+
+ index = btr_cur_get_index(&btr_cur);
+
+ if (UNIV_UNLIKELY
+ (rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+ || rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
+ if (btr_cur.open_leaf(rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+ index, restore_latch_mode, mtr)
+ != DB_SUCCESS) {
+ return restore_status::CORRUPTED;
+ }
+
+ latch_mode =
+ BTR_LATCH_MODE_WITHOUT_INTENTION(restore_latch_mode);
+ pos_state = BTR_PCUR_IS_POSITIONED;
+ block_when_stored.clear();
+
+ return restore_status::NOT_SAME;
+ }
+
+ ut_a(old_rec);
+ ut_a(old_n_core_fields);
+ ut_a(old_n_core_fields <= index->n_core_fields);
+ ut_a(old_n_fields);
+
+ static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
+
+ switch (restore_latch_mode | 4) { // maps the *_LEAF modes onto the *_PREV labels; other modes match no label
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ /* Try optimistic restoration. */
+ if (block_when_stored.run_with_hint(
+ optimistic_latch_leaves{this, &restore_latch_mode,
+ mtr})) {
+ pos_state = BTR_PCUR_IS_POSITIONED;
+ latch_mode = restore_latch_mode;
+
+ if (rel_pos == BTR_PCUR_ON) {
+#ifdef UNIV_DEBUG
+ const rec_t* rec;
+ rec_offs offsets1_[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets1 = offsets1_;
+ rec_offs* offsets2 = offsets2_;
+ rec = btr_pcur_get_rec(this);
+
+ rec_offs_init(offsets1_);
+ rec_offs_init(offsets2_);
+
+ heap = mem_heap_create(256);
+ ut_ad(old_n_core_fields
+ == index->n_core_fields);
+
+ offsets1 = rec_get_offsets(
+ old_rec, index, offsets1,
+ old_n_core_fields,
+ old_n_fields, &heap);
+ offsets2 = rec_get_offsets(
+ rec, index, offsets2,
+ index->n_core_fields,
+ old_n_fields, &heap);
+
+ ut_ad(!cmp_rec_rec(old_rec,
+ rec, offsets1, offsets2,
+ index));
+ mem_heap_free(heap);
+#endif /* UNIV_DEBUG */
+ return restore_status::SAME_ALL;
+ }
+ /* This is the same record as stored,
+ may need to be adjusted for BTR_PCUR_BEFORE/AFTER,
+ depending on search mode and direction. */
+ if (btr_pcur_is_on_user_rec(this)) {
+ pos_state
+ = BTR_PCUR_IS_POSITIONED_OPTIMISTIC;
+ }
+ return restore_status::NOT_SAME;
+ }
+ }
+
+ /* If optimistic restoration did not succeed, open the cursor anew */
+
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, old_n_fields);
+
+ dict_index_copy_types(tuple, index, old_n_fields);
+
+ rec_copy_prefix_to_dtuple(tuple, old_rec, index,
+ old_n_core_fields,
+ old_n_fields, heap);
+ ut_ad(dtuple_check_typed(tuple));
+
+ /* Save the old search mode of the cursor */
+ old_mode = search_mode;
+
+ switch (rel_pos) {
+ case BTR_PCUR_ON:
+ mode = PAGE_CUR_LE;
+ break;
+ case BTR_PCUR_AFTER:
+ mode = PAGE_CUR_G;
+ break;
+ case BTR_PCUR_BEFORE:
+ mode = PAGE_CUR_L;
+ break;
+ default:
+ MY_ASSERT_UNREACHABLE();
+ mode = PAGE_CUR_UNSUPP;
+ }
+
+ if (btr_pcur_open_with_no_init(tuple, mode, restore_latch_mode,
+ this, mtr) != DB_SUCCESS) {
+ mem_heap_free(heap);
+ return restore_status::CORRUPTED;
+ }
+
+ /* Restore the old search mode */
+ search_mode = old_mode;
+
+ ut_ad(rel_pos == BTR_PCUR_ON
+ || rel_pos == BTR_PCUR_BEFORE
+ || rel_pos == BTR_PCUR_AFTER);
+ rec_offs offsets[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets);
+ restore_status ret_val= restore_status::NOT_SAME;
+ if (rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(this)) {
+ ulint n_matched_fields= 0;
+ if (!cmp_dtuple_rec_with_match(
+ tuple, btr_pcur_get_rec(this), index,
+ rec_get_offsets(btr_pcur_get_rec(this), index, offsets,
+ index->n_core_fields, ULINT_UNDEFINED, &heap),
+ &n_matched_fields)) {
+
+ /* We have to store the NEW value for the modify clock,
+ since the cursor can now be on a different page!
+ But we can retain the value of old_rec */
+
+ block_when_stored.store(btr_pcur_get_block(this));
+ modify_clock= buf_block_get_modify_clock(
+ block_when_stored.block());
+
+ mem_heap_free(heap);
+
+ return restore_status::SAME_ALL;
+ }
+ if (n_matched_fields >= index->n_uniq)
+ ret_val= restore_status::SAME_UNIQ; // all unique fields matched, although the full stored prefix did not
+ }
+
+ mem_heap_free(heap);
+
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
+ btr_pcur_store_position(this, mtr);
+
+ return ret_val;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next page: the page cursor is set before
+the first record of the next page. Releases the latch on the current page,
+and buffer-unfixes it. Note that there must not be modifications on the
+current page, as then the x-latch can be released only in mtr_commit.
+@return DB_SUCCESS on success
+@retval DB_CORRUPTION if the next-page link is 0, 1, FIL_NULL or the current
+page itself, or if FIL_PAGE_PREV of the next page does not match
+FIL_PAGE_OFFSET of the current page
+@retval other the error reported by btr_block_get() if the next page could
+not be acquired */
+dberr_t
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_after_last_on_page(cursor));
+
+ /* Discard the stored position. */
+ cursor->old_rec = nullptr;
+
+ const page_t* page = btr_pcur_get_page(cursor);
+ const uint32_t next_page_no = btr_page_get_next(page);
+
+ /* These page numbers are never accepted as a next-page link here. */
+ switch (next_page_no) {
+ case 0:
+ case 1:
+ case FIL_NULL:
+ return DB_CORRUPTION;
+ }
+
+ /* A page must not link to itself. */
+ if (UNIV_UNLIKELY(next_page_no == btr_pcur_get_block(cursor)
+ ->page.id().page_no())) {
+ return DB_CORRUPTION;
+ }
+
+ dberr_t err;
+ /* Latch the next page in the S or X mode encoded in
+ cursor->latch_mode. */
+ buf_block_t* next_block = btr_block_get(
+ *cursor->index(), next_page_no,
+ rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)),
+ page_is_leaf(page), mtr, &err);
+
+ if (UNIV_UNLIKELY(!next_block)) {
+ return err;
+ }
+
+ const page_t* next_page = buf_block_get_frame(next_block);
+
+ /* The sibling links must be consistent: the predecessor recorded on
+ the next page must be the page we are leaving. */
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
+ page + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
+
+ page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
+
+ ut_d(page_check_dir(next_page));
+
+ /* Release the page that we left. NOTE(review): this relies on the old
+ page occupying the memo slot just before the newly acquired one, i.e.
+ the savepoint range [s - 2, s - 1) - confirm against
+ mtr_t::rollback_to_savepoint(). */
+ const auto s = mtr->get_savepoint();
+ mtr->rollback_to_savepoint(s - 2, s - 1);
+ return DB_SUCCESS;
+}
+
+MY_ATTRIBUTE((nonnull,warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record of the page.
+Commits mtr and restarts it. Note that to prevent a possible deadlock, the
+operation first stores the position of the cursor, commits mtr, acquires the
+necessary latches and restores the cursor position again before returning. The
+alphabetical position of the cursor is guaranteed to be sensible on
+return, but it may happen that the cursor is not positioned on the last
+record of any page, because the structure of the tree may have changed
+during the time when the cursor had no latches.
+@return true if restoring the cursor position reported corruption,
+false otherwise */
+static
+bool
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first
+ record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(btr_pcur_is_before_first_on_page(cursor));
+ ut_ad(!btr_pcur_is_before_first_in_tree(cursor));
+
+ /* Remember the caller's latch mode; it is written back to the cursor
+ before a successful return. */
+ const auto latch_mode = cursor->latch_mode;
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+
+ btr_pcur_store_position(cursor, mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ /* OR-ing 4 into BTR_SEARCH_LEAF or BTR_MODIFY_LEAF yields the
+ corresponding BTR_SEARCH_PREV or BTR_MODIFY_PREV mode. */
+ static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
+
+ if (UNIV_UNLIKELY(cursor->restore_position(
+ btr_latch_mode(4 | latch_mode), mtr)
+ == btr_pcur_t::CORRUPTED)) {
+ return true;
+ }
+
+ buf_block_t* block = btr_pcur_get_block(cursor);
+
+ if (page_has_prev(block->page.frame)) {
+ /* Locate the left sibling among the most recent memo entries
+ of the mini-transaction. */
+ buf_block_t* left_block
+ = mtr->at_savepoint(mtr->get_savepoint() - 1);
+ const page_t* const left = left_block->page.frame;
+ if (memcmp_aligned<4>(left + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4)) {
+ /* This should be the right sibling page, or
+ if there is none, the current block. */
+ ut_ad(left_block == block
+ || !memcmp_aligned<4>(left + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4));
+ /* The previous one must be the left sibling. */
+ left_block
+ = mtr->at_savepoint(mtr->get_savepoint() - 2);
+ ut_ad(!memcmp_aligned<4>(left_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4));
+ }
+ if (btr_pcur_is_before_first_on_page(cursor)) {
+ /* Still before the first record: continue from the
+ end of the left sibling; "block" (the page we leave)
+ is released below. */
+ page_cur_set_after_last(left_block,
+ &cursor->btr_cur.page_cur);
+ /* Release the right sibling. */
+ } else {
+ /* Release the left sibling. */
+ block = left_block;
+ }
+ mtr->release(*block);
+ }
+
+ cursor->latch_mode = latch_mode;
+ cursor->old_rec = nullptr;
+ return false;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return true if the cursor was moved; false if the cursor was already
+before the first record in the tree, or if moving to the previous page
+failed */
+bool
+btr_pcur_move_to_prev(
+/*==================*/
+  btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+  function may release the page latch */
+  mtr_t* mtr) /*!< in: mtr */
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+  cursor->old_rec = nullptr;
+
+  /* Common case: there is a predecessor on the same page. */
+  if (!btr_pcur_is_before_first_on_page(cursor)) {
+    return btr_pcur_move_to_prev_on_page(cursor) != nullptr;
+  }
+
+  /* Nothing precedes the start of the tree. */
+  if (btr_pcur_is_before_first_in_tree(cursor)) {
+    return false;
+  }
+
+  /* Cross over to the previous page; the helper returns true on failure. */
+  const bool failed = btr_pcur_move_backward_from_page(cursor, mtr);
+  return !failed;
+}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
new file mode 100644
index 00000000..8435047c
--- /dev/null
+++ b/storage/innobase/btr/btr0sea.cc
@@ -0,0 +1,2328 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file btr/btr0sea.cc
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "btr0sea.h"
+#ifdef BTR_CUR_HASH_ADAPT
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "srv0mon.h"
+
+/** Whether the adaptive hash index (search system) is enabled.
+It is set by btr_search_enable() and cleared by btr_search_disable(),
+in both cases while holding all search latches exclusively. */
+char btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+ulong btr_ahi_parts;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+ulint btr_search_n_succ = 0;
+/** Number of failed adaptive hash index lookups */
+ulint btr_search_n_hash_fail = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+#ifdef UNIV_PFS_RWLOCK
+/** NOTE(review): presumably the performance schema instrumentation key
+for the adaptive hash index latches - confirm where it is registered. */
+mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** The adaptive hash index */
+btr_search_sys_t btr_search_sys;
+
+/** Per-page threshold divisor: building a hash index on a page is
+considered once the number of searches on the page that a hash index
+would have helped exceeds the number of records on the page divided by
+this value, provided that the global limit BTR_SEARCH_BUILD_LIMIT has
+also been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT 100U
+
+/** Fold (hash) a prefix of a leaf page record together with the index tree ID.
+SQL NULL fields do not contribute to the fold value.
+@param[in] rec index record
+@param[in] offsets return value of rec_get_offsets()
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] tree_id index tree ID
+@return the hash value */
+static inline
+ulint
+rec_fold(
+  const rec_t* rec,
+  const rec_offs* offsets,
+  ulint n_fields,
+  ulint n_bytes,
+  index_id_t tree_id)
+{
+  ut_ad(rec_offs_validate(rec, NULL, offsets));
+  ut_ad(rec_validate(rec, offsets));
+  ut_ad(page_rec_is_leaf(rec));
+  ut_ad(!page_rec_is_metadata(rec));
+  ut_ad(n_fields > 0 || n_bytes > 0);
+
+  const ulint n_fields_rec = rec_offs_n_fields(offsets);
+  ut_ad(n_fields <= n_fields_rec);
+  ut_ad(n_fields < n_fields_rec || n_bytes == 0);
+
+  /* Clamp the prefix to the fields that the record has; when every
+  field is folded in full there is no partial trailing field. */
+  if (n_fields >= n_fields_rec) {
+    n_fields = n_fields_rec;
+    n_bytes = 0;
+  }
+
+  ulint fold = ut_fold_ull(tree_id);
+
+  /* Fold the complete fields. */
+  for (ulint field_no = 0; field_no < n_fields; field_no++) {
+    ulint field_len;
+    const byte* field = rec_get_nth_field(rec, offsets, field_no,
+                                          &field_len);
+
+    if (field_len == UNIV_SQL_NULL) {
+      continue;
+    }
+
+    fold = ut_fold_ulint_pair(fold, ut_fold_binary(field, field_len));
+  }
+
+  if (n_bytes == 0) {
+    return(fold);
+  }
+
+  /* Fold at most n_bytes of the field that follows the complete ones. */
+  ulint last_len;
+  const byte* last = rec_get_nth_field(rec, offsets, n_fields, &last_len);
+
+  if (last_len != UNIV_SQL_NULL) {
+    const ulint prefix_len = last_len > n_bytes ? n_bytes : last_len;
+
+    fold = ut_fold_ulint_pair(fold, ut_fold_binary(last, prefix_len));
+  }
+
+  return(fold);
+}
+
+/** Count the key fields touched by a prefix of n_fields complete fields
+followed by n_bytes bytes of the next field.
+@param[in] n_fields number of complete fields
+@param[in] n_bytes number of bytes in an incomplete last field
+@return number of complete or incomplete fields */
+inline MY_ATTRIBUTE((warn_unused_result))
+ulint
+btr_search_get_n_fields(
+  ulint n_fields,
+  ulint n_bytes)
+{
+  /* A partially used trailing field counts as one more field. */
+  if (n_bytes > 0) {
+    return(n_fields + 1);
+  }
+
+  return(n_fields);
+}
+
+/** Count the key fields touched by the prefix stored in a cursor
+(cursor->n_fields and cursor->n_bytes).
+@param[in] cursor b-tree cursor
+@return number of complete or incomplete fields */
+inline MY_ATTRIBUTE((warn_unused_result))
+ulint
+btr_search_get_n_fields(
+  const btr_cur_t* cursor)
+{
+  const ulint complete_fields = cursor->n_fields;
+  const ulint trailing_bytes = cursor->n_bytes;
+
+  return(btr_search_get_n_fields(complete_fields, trailing_bytes));
+}
+
+/** This function should be called before reserving any btr search latch, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. This function
+makes sure that the hash table heap of the index's partition has a spare
+buffer block (heap->free_block), so that a later node allocation under the
+latch is likely to succeed. Success is not guaranteed.
+@param[in] index index handler */
+static void btr_search_check_free_space_in_heap(const dict_index_t *index)
+{
+  /* Allocate the spare block up front, before taking the partition latch;
+  it is handed back below if it turns out not to be needed. */
+  buf_block_t *spare= buf_block_alloc();
+  auto part= btr_search_sys.get_part(*index);
+
+  part->latch.wr_lock(SRW_LOCK_CALL);
+
+  if (btr_search_enabled && !part->heap->free_block)
+    part->heap->free_block= spare;
+  else
+    /* The adaptive hash index is disabled, or the heap already has
+    a spare block. */
+    buf_block_free(spare);
+
+  part->latch.wr_unlock();
+}
+
+/** Reset search_info->ref_count to 0 on every index of a table.
+@param[in,out] table table handler */
+static void btr_search_disable_ref_count(dict_table_t *table)
+{
+  dict_index_t *index= dict_table_get_first_index(table);
+
+  while (index)
+  {
+    index->search_info->ref_count= 0;
+    index= dict_table_get_next_index(index);
+  }
+}
+
+/** Lazily free detached index metadata when removing the last reference.
+If the owning table is left with no entries in either of its index lists,
+the table object is freed as well.
+@param[in,out] index freed index to be removed from table->freed_indexes */
+ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index)
+{
+  ut_ad(index->freed());
+  dict_table_t *const owner= index->table;
+  owner->autoinc_mutex.wr_lock();
+
+  /* Perform the skipped steps of dict_index_remove_from_cache_low(). */
+  UT_LIST_REMOVE(owner->freed_indexes, index);
+  index->lock.free();
+  dict_mem_index_free(index);
+
+  if (UT_LIST_GET_LEN(owner->freed_indexes) ||
+      UT_LIST_GET_LEN(owner->indexes))
+  {
+    /* The table still has entries in one of its index lists; keep it. */
+    owner->autoinc_mutex.wr_unlock();
+    return;
+  }
+
+  /* That was the last index: release the table object too. */
+  ut_ad(!owner->id);
+  owner->autoinc_mutex.wr_unlock();
+  owner->autoinc_mutex.destroy();
+  dict_mem_table_free(owner);
+}
+
+/** Disable the adaptive hash search system and empty the index.
+Does nothing if the adaptive hash index is already disabled. */
+void btr_search_disable()
+{
+  dict_sys.freeze(SRW_LOCK_CALL);
+
+  btr_search_x_lock_all();
+
+  if (btr_search_enabled) {
+    btr_search_enabled = false;
+
+    /* Clear the index->search_info->ref_count of every index in
+    the data dictionary cache. */
+    for (dict_table_t* t = UT_LIST_GET_FIRST(dict_sys.table_LRU); t;
+         t = UT_LIST_GET_NEXT(table_LRU, t)) {
+      btr_search_disable_ref_count(t);
+    }
+
+    for (dict_table_t* t = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); t;
+         t = UT_LIST_GET_NEXT(table_LRU, t)) {
+      btr_search_disable_ref_count(t);
+    }
+
+    dict_sys.unfreeze();
+
+    /* Set all block->index = NULL. */
+    buf_pool.clear_hash_index();
+
+    /* Clear the adaptive hash index. */
+    btr_search_sys.clear();
+  } else {
+    /* Already disabled: only the latches need to be released. */
+    dict_sys.unfreeze();
+  }
+
+  btr_search_x_unlock_all();
+}
+
+/** Enable the adaptive hash search system.
+Unless called from buf_pool_t::resize(), this does nothing while
+srv_buf_pool_old_size differs from srv_buf_pool_size. It also does nothing
+if the hash index heap of the first partition already exists.
+@param resize whether buf_pool_t::resize() is the caller */
+void btr_search_enable(bool resize)
+{
+  if (!resize) {
+    mysql_mutex_lock(&buf_pool.mutex);
+    const bool size_differs = srv_buf_pool_old_size != srv_buf_pool_size;
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    if (size_differs) {
+      return;
+    }
+  }
+
+  btr_search_x_lock_all();
+  const ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64;
+
+  if (!btr_search_sys.parts[0].heap) {
+    btr_search_sys.alloc(hash_size);
+    btr_search_enabled = true;
+  } else {
+    /* The hash index has already been allocated. */
+    ut_ad(btr_search_enabled);
+  }
+
+  btr_search_x_unlock_all();
+}
+
+/** Updates the search info of an index about hash successes. NOTE that info
+is NOT protected by any semaphore, to save CPU time! Do not assume its fields
+are consistent.
+Either increments info->n_hash_potential, when the currently recommended
+prefix (info->n_fields, info->n_bytes, info->left_side) would have served
+this search, or replaces the recommendation with one derived from
+cursor->low_match/low_bytes and cursor->up_match/up_bytes.
+@param[in,out] info search info
+@param[in] cursor cursor which was just positioned */
+static void btr_search_info_update_hash(btr_search_t *info, btr_cur_t *cursor)
+{
+ dict_index_t* index = cursor->index();
+ int cmp;
+
+ if (dict_index_is_ibuf(index)) {
+ /* So many deletes are performed on an insert buffer tree
+ that we do not consider a hash index useful on it: */
+
+ return;
+ }
+
+ uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
+
+ if (info->n_hash_potential == 0) {
+ /* There is no usable recommendation yet. */
+
+ goto set_new_recomm;
+ }
+
+ /* Test if the search would have succeeded using the recommended
+ hash prefix */
+
+ if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+ info->n_hash_potential++;
+
+ return;
+ }
+
+ /* Compare the recommended prefix with the match on the lower
+ neighbour record. */
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->low_match, cursor->low_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto set_new_recomm;
+ }
+
+ /* Compare the recommended prefix with the match on the upper
+ neighbour record. */
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->up_match, cursor->up_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto increment_potential;
+ }
+
+set_new_recomm:
+ /* We have to set a new recommendation; skip the hash analysis
+ for a while to avoid unnecessary CPU time usage when there is no
+ chance for success */
+
+ info->hash_analysis = 0;
+
+ /* cmp > 0: the upper match is longer; cmp < 0: the lower match is
+ longer; cmp == 0: both matches are equally long, in which case
+ n_hash_potential is reset to 0. */
+ cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+ cursor->low_match, cursor->low_bytes);
+ info->left_side = cmp >= 0;
+ info->n_hash_potential = cmp != 0;
+
+ if (cmp == 0) {
+ /* For extra safety, we set some sensible values here */
+ info->n_fields = 1;
+ info->n_bytes = 0;
+ } else if (cmp > 0) {
+ /* This repeats the value already assigned above (cmp != 0). */
+ info->n_hash_potential = 1;
+
+ if (cursor->up_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match < cursor->up_match) {
+
+ info->n_fields = static_cast<uint16_t>(
+ cursor->low_match + 1);
+ info->n_bytes = 0;
+ } else {
+ /* The field counts are equal; recommend one byte more
+ than the lower match. */
+ info->n_fields = static_cast<uint16_t>(
+ cursor->low_match);
+ info->n_bytes = static_cast<uint16_t>(
+ cursor->low_bytes + 1);
+ }
+ } else {
+ /* Mirror image of the branch above, with the roles of the
+ lower and upper match swapped. */
+ if (cursor->low_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+ } else if (cursor->low_match > cursor->up_match) {
+
+ info->n_fields = static_cast<uint16_t>(
+ cursor->up_match + 1);
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = static_cast<uint16_t>(
+ cursor->up_match);
+ info->n_bytes = static_cast<uint16_t>(
+ cursor->up_bytes + 1);
+ }
+ }
+}
+
+/** Update the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+Also sets info->last_hash_succ: it is cleared first and set again only if
+the block has a hash index whose parameters equal the recommendation.
+@param[in,out] info search info
+@param[in,out] block buffer block
+@return TRUE if building a (new) hash index on the block is recommended */
+static
+bool
+btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
+{
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+
+ info->last_hash_succ = FALSE;
+ ut_ad(block->page.frame);
+ ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
+
+ /* Does the prefix counted so far on this block equal the current
+ recommendation in info? */
+ if ((block->n_hash_helps > 0)
+ && (info->n_hash_potential > 0)
+ && (block->n_fields == info->n_fields)
+ && (block->n_bytes == info->n_bytes)
+ && (block->left_side == info->left_side)) {
+
+ if ((block->index)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+
+ /* The search would presumably have succeeded using
+ the hash index */
+
+ info->last_hash_succ = TRUE;
+ }
+
+ block->n_hash_helps++;
+ } else {
+ /* Start counting afresh for the new recommendation. */
+ block->n_hash_helps = 1;
+ block->n_fields = info->n_fields;
+ block->n_bytes = info->n_bytes;
+ block->left_side = info->left_side;
+ }
+
+ /* Both the per-page threshold and the global threshold must be
+ reached before a build is recommended. */
+ if ((block->n_hash_helps > page_get_n_recs(block->page.frame)
+ / BTR_SEARCH_PAGE_BUILD_LIMIT)
+ && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
+
+ /* Recommend a build if the block has no hash index, if
+ n_hash_helps exceeds twice the number of records on the page,
+ or if the existing hash index uses other parameters. */
+ if ((!block->index)
+ || (block->n_hash_helps
+ > 2U * page_get_n_recs(block->page.frame))
+ || (block->n_fields != block->curr_n_fields)
+ || (block->n_bytes != block->curr_n_bytes)
+ || (block->left_side != block->curr_left_side)) {
+
+ /* Build a new hash index on the page */
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Maximum number of records in a page; used as the upper bound in the
+debug assertions on buf_block_t::n_pointers */
+constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+__attribute__((nonnull))
+/**
+Insert an entry into the hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted. Otherwise a new node is allocated from the heap and appended
+to the end of the hash chain.
+@param table hash table
+@param heap memory heap
+@param fold folded value of the record
+@param block buffer block containing the record (this parameter exists
+ only if UNIV_AHI_DEBUG or UNIV_DEBUG is defined)
+@param data the record
+@retval true on success
+@retval false if no more memory could be allocated */
+static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
+ ulint fold,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t *block, /*!< buffer block of data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->page.frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ut_ad(btr_search_enabled);
+
+ hash_cell_t *cell= &table->array[table->calc_hash(fold)];
+
+ /* Look for an existing node with the same fold value in the chain. */
+ for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev;
+ prev= prev->next)
+ {
+ if (prev->fold == fold)
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /* Move the pointer count from the old block to the new one. */
+ buf_block_t *prev_block= prev->block;
+ ut_a(prev_block->page.frame == page_align(prev->data));
+ ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(block->n_pointers++ < MAX_N_POINTERS);
+
+ prev->block= block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ prev->data= data;
+ return true;
+ }
+ }
+
+ /* We have to allocate a new chain node */
+ ha_node_t *node= static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node));
+
+ if (!node)
+ return false;
+
+ ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->n_pointers++ < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ node->fold= fold;
+ node->next= nullptr;
+
+ /* Append the new node to the end of the chain. */
+ ha_node_t *prev= static_cast<ha_node_t*>(cell->node);
+ if (!prev)
+ cell->node= node;
+ else
+ {
+ while (prev->next)
+ prev= prev->next;
+ prev->next= node;
+ }
+ return true;
+}
+
+__attribute__((nonnull))
+/** Delete a node from the hash table and return its memory to the heap.
+The heap of nodes is kept compact: unless del_node is the topmost node of
+the heap, the topmost node is copied into the place of del_node and the
+chain pointer that referred to the topmost node is redirected.
+@param table hash table
+@param heap memory heap
+@param del_node hash chain node to be deleted */
+static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
+ ha_node_t *del_node)
+{
+ ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(del_node->block->page.frame == page_align(del_node->data));
+ ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ const ulint fold= del_node->fold;
+
+ /* Unlink del_node from its hash chain. */
+ HASH_DELETE(ha_node_t, next, table, fold, del_node);
+
+ ha_node_t *top= static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top));
+
+ if (del_node != top)
+ {
+ /* Compact the heap of nodes by moving the top in the place of del_node. */
+ *del_node= *top;
+ hash_cell_t *cell= &table->array[table->calc_hash(top->fold)];
+
+ /* Look for the pointer to the top node, to update it */
+ if (cell->node == top)
+ /* The top node is the first in the chain */
+ cell->node= del_node;
+ else
+ {
+ /* We have to look for the predecessor */
+ ha_node_t *node= static_cast<ha_node_t*>(cell->node);
+
+ while (top != HASH_GET_NEXT(next, node))
+ node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node));
+
+ /* Now we have the predecessor node */
+ node->next= del_node;
+ }
+ }
+
+ /* Free the occupied space */
+ mem_heap_free_top(heap, sizeof *top);
+}
+
+__attribute__((nonnull))
+/** Delete all nodes in one hash chain that point to records on a page.
+@param table hash table
+@param heap memory heap
+@param fold fold value that selects the hash chain to scan
+@param page page frame; nodes whose data lies on this page are deleted */
+static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap,
+ ulint fold, const page_t *page)
+{
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node; )
+ {
+ if (page_align(ha_node_get_data(node)) == page)
+ {
+ ha_delete_hash_node(table, heap, node);
+ /* The deletion may compact the heap of nodes and move other nodes!
+ Therefore restart the scan from the beginning of the chain. */
+ node= ha_chain_get_first(table, fold);
+ }
+ else
+ node= ha_chain_get_next(node);
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node;
+ node= ha_chain_get_next(node))
+ ut_ad(page_align(ha_node_get_data(node)) != page);
+#endif /* UNIV_DEBUG */
+}
+
+/** Look up the hash node for a record and delete the node if it exists.
+@param table hash table
+@param heap memory heap for the hash bucket chain
+@param fold folded value of the searched data
+@param data pointer to the record
+@return whether the record was found */
+static bool ha_search_and_delete_if_found(hash_table_t *table,
+                                          mem_heap_t *heap,
+                                          ulint fold, const rec_t *data)
+{
+  ha_node_t *found= ha_search_with_data(table, fold, data);
+
+  if (!found)
+    return false;
+
+  ha_delete_hash_node(table, heap, found);
+  return true;
+}
+
+__attribute__((nonnull))
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found. Nothing is updated if the adaptive
+hash index is disabled (btr_search_enabled is not set).
+@param table hash table
+@param fold folded value of the searched data
+@param data pointer to the data
+@param new_block block containing new_data (this parameter exists only if
+ UNIV_AHI_DEBUG or UNIV_DEBUG is defined)
+@param new_data new pointer to the data
+@return whether the element was found */
+static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
+ const rec_t *data,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /** block containing new_data */
+ buf_block_t *new_block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *new_data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->page.frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ if (!btr_search_enabled)
+ return false;
+
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /* Move the pointer count from the old block to the new one. */
+ ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
+ node->block= new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data= new_data;
+
+ return true;
+ }
+
+ return false;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+#else
+/* In non-debug builds the functions above have no buffer block parameter.
+These wrappers let callers always pass the block argument; it is dropped
+here. */
+# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d)
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found(table,fold,data,new_data)
+#endif
+
+/** Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index.
+If the block is hashed for a different index object than the one of the
+cursor, the hash index of the page is dropped instead.
+@param[in] info search info
+@param[in] block buffer block where cursor positioned
+@param[in] cursor cursor */
+static
+void
+btr_search_update_hash_ref(
+ const btr_search_t* info,
+ buf_block_t* block,
+ const btr_cur_t* cursor)
+{
+ ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
+
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+ ut_ad(page_align(btr_cur_get_rec(cursor)) == block->page.frame);
+ ut_ad(page_is_leaf(block->page.frame));
+ assert_block_ahi_valid(block);
+
+ dict_index_t* index = block->index;
+
+ /* Nothing to do if the page is not hashed or there is no current
+ recommendation. */
+ if (!index || !info->n_hash_potential) {
+ return;
+ }
+
+ if (index != cursor->index()) {
+ /* The page is hashed for another index object that has the
+ same index ID (asserted below); drop that hash index. */
+ ut_ad(index->id == cursor->index()->id);
+ btr_search_drop_page_hash_index(block, false);
+ return;
+ }
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(index == cursor->index());
+ ut_ad(!dict_index_is_ibuf(index));
+ auto part = btr_search_sys.get_part(*index);
+ part->latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(!block->index || block->index == index);
+
+ /* Re-check under the partition latch: block->index and
+ btr_search_enabled were read above without holding it. */
+ if (block->index
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)
+ && btr_search_enabled) {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_user_rec(rec)) {
+ goto func_exit;
+ }
+
+ ulint fold = rec_fold(
+ rec,
+ rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields,
+ block->curr_n_bytes, index->id);
+ /* rec_get_offsets() may have allocated a heap. */
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* The return value is ignored: a failed insert only means
+ that the record is not added to the hash index. */
+ ha_insert_for_fold(&part->table, part->heap, fold, block, rec);
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+func_exit:
+ part->latch.wr_unlock();
+}
+
+/** Checks if a guessed position for a tree cursor is right. Note that if
+mode is PAGE_CUR_LE, which is used in inserts, and the function returns
+TRUE, then cursor->up_match and cursor->low_match both have sensible values.
+The record under the cursor is compared first; unless that comparison alone
+decides the outcome, the previous record (for PAGE_CUR_G, PAGE_CUR_GE) or
+the next record (for PAGE_CUR_L, PAGE_CUR_LE) on the page is compared too.
+@param[in,out] cursor guess cursor position
+@param[in] can_only_compare_to_cursor_rec
+ if we do not have a latch on the page of cursor,
+ but a latch corresponding search system, then
+ ONLY the columns of the record UNDER the cursor
+ are protected, not the next or previous record
+ in the chain: we cannot look at the next or
+ previous record to check our guess!
+@param[in] tuple data tuple
+@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE
+@return whether a match was found */
+static
+bool
+btr_search_check_guess(
+ btr_cur_t* cursor,
+ bool can_only_compare_to_cursor_rec,
+ const dtuple_t* tuple,
+ ulint mode)
+{
+ rec_t* rec;
+ ulint n_unique;
+ ulint match;
+ int cmp;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ bool success = false;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique_in_tree(cursor->index());
+
+ rec = btr_cur_get_rec(cursor);
+
+ /* The guess must point to a user record on a leaf page; for tables
+ that are not in the redundant format, the record status must also be
+ REC_STATUS_ORDINARY or REC_STATUS_INSTANT. */
+ if (UNIV_UNLIKELY(!page_rec_is_user_rec(rec)
+ || !page_rec_is_leaf(rec))) {
+ ut_ad("corrupted index" == 0);
+ return false;
+ } else if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ return false;
+ }
+ }
+
+ match = 0;
+
+ /* Compare the tuple with the record under the cursor. */
+ offsets = rec_get_offsets(rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(tuple, rec, cursor->index(), offsets,
+ &match);
+
+ /* Reject the guess if the record under the cursor is on the wrong
+ side of the tuple for the given search mode. */
+ if (mode == PAGE_CUR_GE) {
+ if (cmp > 0) {
+ goto exit_func;
+ }
+
+ cursor->up_match = match;
+
+ if (match >= n_unique) {
+ /* All unique fields matched; there is no need to
+ look at the neighbouring record. */
+ success = true;
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_LE) {
+ if (cmp < 0) {
+ goto exit_func;
+ }
+
+ cursor->low_match = match;
+
+ } else if (mode == PAGE_CUR_G) {
+ if (cmp >= 0) {
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_L) {
+ if (cmp <= 0) {
+ goto exit_func;
+ }
+ }
+
+ if (can_only_compare_to_cursor_rec) {
+ /* Since we could not determine if our guess is right just by
+ looking at the record under the cursor, return FALSE */
+ goto exit_func;
+ }
+
+ match = 0;
+
+ if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
+ /* The guess is right only if the previous record is on the
+ other side of the tuple. */
+ const rec_t* prev_rec = page_rec_get_prev(rec);
+
+ if (UNIV_UNLIKELY(!prev_rec)) {
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+
+ if (page_rec_is_infimum(prev_rec)) {
+ /* rec is the first user record on the page: the guess
+ is accepted only if the page has no predecessor. */
+ success = !page_has_prev(page_align(prev_rec));
+ goto exit_func;
+ }
+
+ if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(prev_rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+ }
+
+ offsets = rec_get_offsets(prev_rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(tuple, prev_rec,
+ cursor->index(), offsets,
+ &match);
+ if (mode == PAGE_CUR_GE) {
+ success = cmp > 0;
+ } else {
+ success = cmp >= 0;
+ }
+ } else {
+ /* PAGE_CUR_L or PAGE_CUR_LE: the guess is right only if the
+ next record is on the other side of the tuple. */
+ ut_ad(!page_rec_is_supremum(rec));
+
+ const rec_t* next_rec = page_rec_get_next(rec);
+
+ if (UNIV_UNLIKELY(!next_rec)) {
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+
+ if (page_rec_is_supremum(next_rec)) {
+ /* rec is the last user record on the page: the guess
+ is accepted only if the page has no successor. */
+ if (!page_has_next(page_align(next_rec))) {
+ cursor->up_match = 0;
+ success = true;
+ }
+
+ goto exit_func;
+ }
+
+ if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(next_rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+ }
+
+ offsets = rec_get_offsets(next_rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
+ n_unique, &heap);
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, next_rec, cursor->index(), offsets, &match);
+ if (mode == PAGE_CUR_LE) {
+ success = cmp < 0;
+ cursor->up_match = match;
+ } else {
+ success = cmp <= 0;
+ }
+ }
+exit_func:
+ /* rec_get_offsets() may have allocated a heap. */
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+/** Record that an adaptive hash index lookup failed.
+Sets cursor->flag to BTR_CUR_HASH_FAIL and clears info->last_hash_succ.
+If UNIV_SEARCH_PERF_STAT is defined, info->n_hash_fail is incremented and
+info->n_hash_succ is decremented unless it is not greater than 0.
+@param[in,out] info search info
+@param[in,out] cursor cursor whose flag is updated */
+static
+void
+btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
+{
+  cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+  info->n_hash_fail++;
+
+  if (info->n_hash_succ > 0)
+    info->n_hash_succ--;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+  info->last_hash_succ = FALSE;
+}
+
+/** Clear the adaptive hash index on all pages in the buffer pool.
+Sets block->index to nullptr on every block of every chunk, and afterwards
+lazily frees each index for which index->freed() held. The adaptive hash
+index must already be disabled (btr_search_enabled is not set). */
+inline void buf_pool_t::clear_hash_index()
+{
+ ut_ad(!resizing);
+ ut_ad(!btr_search_enabled);
+
+ /* Freed indexes found on blocks; a set is used so that each index is
+ passed to btr_search_lazy_free() only once. */
+ std::set<dict_index_t*> garbage;
+
+ for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
+ {
+ for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
+ block != end; block++)
+ {
+ dict_index_t *index= block->index;
+ assert_block_ahi_valid(block);
+
+ /* We can clear block->index and block->n_pointers when
+ holding all AHI latches exclusively; see the comments in buf0buf.h */
+
+ if (!index)
+ {
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(!block->n_pointers);
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ continue;
+ }
+
+ ut_d(const auto s= block->page.state());
+ /* Another thread may have set the state to
+ REMOVE_HASH in buf_LRU_block_remove_hashed().
+
+ The state change in buf_pool_t::realloc() is not observable
+ here, because in that case we would have !block->index.
+
+ In the end, the entire adaptive hash index will be removed. */
+ ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH);
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers= 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ if (index->freed())
+ garbage.insert(index);
+ block->index= nullptr;
+ }
+ }
+
+ /* Free the collected indexes only after all blocks have been
+ processed. */
+ for (dict_index_t *index : garbage)
+ btr_search_lazy_free(index);
+}
+
+/** Translate a pointer stored in the adaptive hash index into the
+buffer block descriptor of the page frame that it points into.
+This function does not return if the block is not identified.
+@param ptr  pointer to within a page frame
+@return pointer to block, never NULL */
+inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
+{
+  ut_ad(chunk_t::map_ref == chunk_t::map_reg);
+  ut_ad(!resizing);
+
+  chunk_t::map *const map= chunk_t::map_ref;
+
+  /* upper_bound() yields the first entry whose key is greater than ptr;
+  the chunk of the entry preceding it is the one that is used. */
+  chunk_t::map::const_iterator after= map->upper_bound(ptr);
+  ut_a(after != map->begin());
+
+  chunk_t *chunk;
+  if (after == map->end())
+  {
+    chunk= map->rbegin()->second;
+  }
+  else
+  {
+    --after;
+    chunk= after->second;
+  }
+
+  /* Index of the frame within the chunk, in units of pages. */
+  const size_t frame_no=
+    size_t(ptr - chunk->blocks->page.frame) >> srv_page_size_shift;
+  ut_a(frame_no < chunk->size);
+
+  buf_block_t *block= chunk->blocks + frame_no;
+  /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
+  block[n].frame == block->page.frame + n * srv_page_size. Check it. */
+  ut_ad(block->page.frame == page_align(ptr));
+  /* The state is read without holding hash_lock, so a concurrent
+  transition to REMOVE_HASH is possible during this execution. */
+  ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH);
+
+  return block;
+}
+
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+On failure after the hash lookup was attempted, btr_search_failure() is
+invoked, which sets cursor->flag to BTR_CUR_HASH_FAIL.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[out] cursor tree cursor
+@param[in] mtr mini-transaction
+@return whether the search succeeded */
+TRANSACTIONAL_TARGET
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr)
+{
+ ulint fold;
+ index_id_t index_id;
+
+ ut_ad(mtr->is_active());
+ ut_ad(index->is_btree() || index->is_ibuf());
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ /* Do not even try the hash index for latch modes above
+ BTR_MODIFY_LEAF, if the previous hash search did not succeed,
+ if there is no hash potential, or for a minimum-record tuple. */
+ if (latch_mode > BTR_MODIFY_LEAF
+ || !info->last_hash_succ || !info->n_hash_potential
+ || (tuple->info_bits & REC_INFO_MIN_REC_FLAG)) {
+ return false;
+ }
+
+ ut_ad(index->is_btree());
+ ut_ad(!index->table->is_temporary());
+
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+ compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ /* The tuple must contain at least as many fields as are hashed. */
+ if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) {
+ return false;
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ /* Counted here in advance; btr_search_failure() decrements
+ the counter again if the guess fails. */
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ auto part = btr_search_sys.get_part(*index);
+ const rec_t* rec;
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled) {
+ goto ahi_release_and_fail;
+ }
+
+ rec = static_cast<const rec_t*>(
+ ha_search_and_get_data(&part->table, fold));
+
+ if (!rec) {
+ /* Failure exits: the first label also releases the shared
+ adaptive hash index partition latch. */
+ahi_release_and_fail:
+ part->latch.rd_unlock();
+fail:
+ btr_search_failure(info, cursor);
+ return false;
+ }
+
+ buf_block_t* block = buf_pool.block_from_ahi(rec);
+
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(
+ block->page.id().fold());
+ bool got_latch;
+ {
+ /* Try to acquire the page latch without waiting, while
+ holding the page_hash latch of the block's hash chain. */
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+ got_latch = (latch_mode == BTR_SEARCH_LEAF)
+ ? block->page.lock.s_lock_try()
+ : block->page.lock.x_lock_try();
+ }
+
+ if (!got_latch) {
+ goto ahi_release_and_fail;
+ }
+
+ const auto state = block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ ut_ad(state == buf_page_t::REMOVE_HASH);
+ /* Failure exit that first releases the page latch
+ in the mode in which it was acquired. */
+block_and_ahi_release_and_fail:
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ block->page.lock.s_unlock();
+ } else {
+ block->page.lock.x_unlock();
+ }
+ goto ahi_release_and_fail;
+ }
+
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(state < buf_page_t::READ_FIX || latch_mode == BTR_SEARCH_LEAF);
+
+ /* The block is hashed for a different index object that carries
+ the same index id; that object is asserted to be a freed index. */
+ if (index != block->index && index_id == block->index->id) {
+ ut_a(block->index->freed());
+ goto block_and_ahi_release_and_fail;
+ }
+
+ block->page.fix();
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF},
+ "");
+ static_assert(ulint{MTR_MEMO_PAGE_X_FIX} == ulint{BTR_MODIFY_LEAF},
+ "");
+
+ part->latch.rd_unlock();
+
+ ++buf_pool.stat.n_page_gets;
+
+ /* Register the latched block in the mini-transaction; thanks to
+ the static assertions above, latch_mode can be used as the
+ memo type directly. */
+ mtr->memo_push(block, mtr_memo_type_t(latch_mode));
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, (rec_t*) rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on search system, not on the
+ page, it only protects the columns of the record the cursor
+ is positioned on. We cannot look at the next or the previous
+ record to determine if our guess for the cursor position is
+ right. */
+ if (index_id != btr_page_get_index_id(block->page.frame)
+ || !btr_search_check_guess(cursor, false, tuple, mode)) {
+ mtr->release_last_page();
+ goto fail;
+ }
+
+ /* Increase the hash potential, capped at
+ BTR_SEARCH_BUILD_LIMIT + 5. */
+ if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) {
+
+ info->n_hash_potential++;
+ }
+
+ info->last_hash_succ = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_succ++;
+#endif
+ return true;
+}
+
+/** Drop any adaptive hash index entries that point to an index page.
+
+The fold values of the records on the page are first computed without
+holding the adaptive hash index latch exclusively (unless the index is
+marked as freed, in which case the exclusive latch is held throughout),
+and then the hash nodes are removed under the exclusive latch. If the
+hash parameters of the block changed in between, the whole operation is
+retried.
+@param[in,out] block block containing index page, s- or x-latched, or an
+ index page for which we know that
+ block->buf_fix_count == 0 or it is an index page which
+ has already been removed from the buf_pool.page_hash
+ i.e.: it is in state buf_page_t::REMOVE_HASH
+@param[in] garbage_collect drop ahi only if the index is marked
+ as freed */
+void btr_search_drop_page_hash_index(buf_block_t* block,
+ bool garbage_collect)
+{
+ ulint n_fields;
+ ulint n_bytes;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+retry:
+ if (!block->index) {
+ return;
+ }
+
+ ut_d(const auto state = block->page.state());
+ ut_ad(state == buf_page_t::REMOVE_HASH
+ || state >= buf_page_t::UNFIXED);
+ ut_ad(state == buf_page_t::REMOVE_HASH
+ || !(~buf_page_t::LRU_MASK & state)
+ || block->page.lock.have_any());
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(page_is_leaf(block->page.frame));
+
+ /* We must not dereference block->index here, because it could be freed
+ if (!index->table->get_ref_count() && !dict_sys.frozen()).
+ Determine the ahi_slot based on the block contents. */
+
+ const index_id_t index_id
+ = btr_page_get_index_id(block->page.frame);
+
+ auto part = btr_search_sys.get_part(index_id,
+ block->page.id().space());
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ dict_index_t* index = block->index;
+ bool is_freed = index && index->freed();
+
+ if (is_freed) {
+ /* For a freed index, switch to the exclusive latch and keep
+ it until the end of the function. Because the latch is
+ released in between, block->index must be checked again. */
+ part->latch.rd_unlock();
+ part->latch.wr_lock(SRW_LOCK_CALL);
+ if (index != block->index) {
+ part->latch.wr_unlock();
+ goto retry;
+ }
+ } else if (garbage_collect) {
+ part->latch.rd_unlock();
+ return;
+ }
+
+ assert_block_ahi_valid(block);
+
+ if (!index || !btr_search_enabled) {
+ if (is_freed) {
+ part->latch.wr_unlock();
+ } else {
+ part->latch.rd_unlock();
+ }
+ return;
+ }
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(btr_search_enabled);
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_a(index_id == index->id);
+ ut_ad(!dict_index_is_ibuf(index));
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+
+ /* NOTE: The AHI fields of block must not be accessed after
+ releasing search latch, as the index page might only be s-latched! */
+
+ if (!is_freed) {
+ part->latch.rd_unlock();
+ }
+
+ ut_a(n_fields > 0 || n_bytes > 0);
+
+ const page_t* const page = block->page.frame;
+ ulint n_recs = page_get_n_recs(page);
+ if (!n_recs) {
+ ut_ad("corrupted adaptive hash index" == 0);
+ /* At this point the shared latch has already been released,
+ but in the is_freed case the exclusive latch is still held.
+ Release it so that this error exit does not leak the latch. */
+ if (is_freed) {
+ part->latch.wr_unlock();
+ }
+ return;
+ }
+
+ /* Calculate and cache fold values into an array for fast deletion
+ from the hash index */
+
+ rec = page_get_infimum_rec(page);
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+ ulint* folds;
+ ulint n_cached = 0;
+ ulint prev_fold = 0;
+
+ if (rec && rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+ if (!--n_recs) {
+ /* The page only contains the hidden metadata record
+ for instant ALTER TABLE that the adaptive hash index
+ never points to. */
+ folds = nullptr;
+ goto all_deleted;
+ }
+ }
+
+ folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint));
+ heap = nullptr;
+ offsets = nullptr;
+
+ while (rec) {
+ /* Never store more than n_recs fold values in folds[]. */
+ if (n_cached >= n_recs) {
+ ut_ad(page_rec_is_supremum(rec));
+ break;
+ }
+ ut_ad(page_rec_is_user_rec(rec));
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ const ulint fold = rec_fold(rec, offsets, n_fields, n_bytes,
+ index_id);
+
+ /* Skip a fold value that equals the one of the
+ preceding record; it is already cached. */
+ if (fold == prev_fold && prev_fold != 0) {
+
+ goto next_rec;
+ }
+
+ /* Remove all hash nodes pointing to this page from the
+ hash chain */
+ folds[n_cached++] = fold;
+
+next_rec:
+ rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ if (!rec || page_rec_is_supremum(rec)) {
+ break;
+ }
+ prev_fold = fold;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+all_deleted:
+ if (!is_freed) {
+ /* The latch was released while the fold values were being
+ computed; reacquire it exclusively and revalidate. */
+ part->latch.wr_lock(SRW_LOCK_CALL);
+
+ if (UNIV_UNLIKELY(!block->index)) {
+ /* Someone else has meanwhile dropped the
+ hash index */
+ goto cleanup;
+ }
+
+ ut_a(block->index == index);
+ }
+
+ if (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes) {
+
+ /* Someone else has meanwhile built a new hash index on the
+ page, with different parameters */
+
+ part->latch.wr_unlock();
+
+ ut_free(folds);
+ goto retry;
+ }
+
+ for (ulint i = 0; i < n_cached; i++) {
+ ha_remove_all_nodes_to_page(&part->table, part->heap,
+ folds[i], page);
+ }
+
+ /* Decrement the reference count of the index. When the last
+ reference of a freed index goes away, free it lazily. */
+ switch (index->search_info->ref_count--) {
+ case 0:
+ ut_error;
+ case 1:
+ if (index->freed()) {
+ btr_search_lazy_free(index);
+ }
+ }
+
+ block->index = nullptr;
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
+
+cleanup:
+ assert_block_ahi_valid(block);
+ part->latch.wr_unlock();
+
+ ut_free(folds);
+}
+
+/** Drop possible adaptive hash index entries when a page is evicted
+from the buffer pool or freed in a file, or the index is being dropped.
+The page is only peeked at (BUF_PEEK_IF_IN_POOL) within a private
+mini-transaction.
+@param[in] page_id page id */
+void btr_search_drop_page_hash_when_freed(const page_id_t page_id)
+{
+  mtr_t mtr;
+
+  mtr_start(&mtr);
+
+  /* If the caller has a latch on the page, then the caller must
+  have a x-latch on the page and it must have already dropped
+  the hash index for the page. Because of the x-latch that we
+  are possibly holding, we cannot s-latch the page, but must
+  (recursively) x-latch it, even though we are only reading. */
+
+  buf_block_t *const peeked = buf_page_get_gen(page_id, 0, RW_X_LATCH,
+                                               NULL,
+                                               BUF_PEEK_IF_IN_POOL,
+                                               &mtr);
+
+  if (!peeked || !peeked->index) {
+    /* No block was returned, or the block is not hashed. */
+    mtr_commit(&mtr);
+    return;
+  }
+
+  /* In all our callers, the table handle should
+  be open, or we should be in the process of
+  dropping the table (preventing eviction). */
+  DBUG_ASSERT(peeked->index->table->get_ref_count()
+              || dict_sys.locked());
+  btr_search_drop_page_hash_index(peeked, false);
+
+  mtr_commit(&mtr);
+}
+
+/** Build a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+This function checks if n_fields and n_bytes are sensible, and does not
+build a hash index if not. Nothing is built either if the adaptive hash
+index is disabled or the page contains no user records.
+@param[in,out] index index for which to build; must not be NULL
+ (it is dereferenced unconditionally)
+@param[in,out] block index page, s-/x- latched.
+@param[in,out] ahi_latch the adaptive search latch
+@param[in] n_fields hash this many full fields
+@param[in] n_bytes hash this many bytes of the next field
+@param[in] left_side hash for searches from left side */
+static
+void
+btr_search_build_page_hash_index(
+ dict_index_t* index,
+ buf_block_t* block,
+ srw_spin_lock* ahi_latch,
+ uint16_t n_fields,
+ uint16_t n_bytes,
+ bool left_side)
+{
+ const rec_t* rec;
+ ulint fold;
+ ulint next_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ const rec_t** recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ ut_ad(!index->table->is_temporary());
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec_offs_init(offsets_);
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
+ ut_ad(index);
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(page_is_leaf(block->page.frame));
+
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+ ut_ad(block->page.id().page_no() >= 3);
+
+ /* Under the shared latch, re-read btr_search_enabled and determine
+ whether the block is currently hashed with parameters that differ
+ from the requested ones. */
+ ahi_latch->rd_lock(SRW_LOCK_CALL);
+
+ const bool enabled = btr_search_enabled;
+ const bool rebuild = enabled && block->index
+ && (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side);
+
+ ahi_latch->rd_unlock();
+
+ if (!enabled) {
+ return;
+ }
+
+ if (rebuild) {
+ btr_search_drop_page_hash_index(block, false);
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields == 0 && n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index)
+ < btr_search_get_n_fields(n_fields, n_bytes)) {
+ return;
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ rec = page_rec_get_next_const(page_get_infimum_rec(page));
+ if (!rec) return;
+
+ /* Skip a metadata record at the start of the page; it is not
+ hashed and does not count towards the array sizes below. */
+ if (rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_const(rec);
+ if (!rec || !--n_recs) return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds));
+ recs = static_cast<const rec_t**>(
+ ut_malloc_nokey(n_recs * sizeof *recs));
+
+ n_cached = 0;
+
+ ut_a(index->id == btr_page_get_index_id(page));
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ ut_ad(page_rec_is_supremum(rec)
+ || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0));
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+ /* With left_side, the first record of each run of equal fold
+ values is cached; otherwise the last record of each run is. */
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ while (const rec_t* next_rec = page_rec_get_next_const(rec)) {
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap(index);
+
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ /* btr_search_enabled is checked again now that the exclusive
+ latch is held. */
+ if (!btr_search_enabled) {
+ goto exit_func;
+ }
+
+ /* This counter is decremented every time we drop page
+ hash index entries and is incremented here. Since we can
+ rebuild hash index for a page that is already hashed, we
+ have to take care not to increment the counter in that
+ case. */
+ if (!block->index) {
+ assert_block_ahi_empty(block);
+ index->search_info->ref_count++;
+ } else if (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side) {
+ /* The block is hashed with parameters other than the
+ requested ones; leave it alone. */
+ goto exit_func;
+ }
+
+ block->n_hash_helps = 0;
+
+ block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS;
+ block->curr_n_bytes = n_bytes & ((1U << 15) - 1);
+ block->curr_left_side = left_side;
+ block->index = index;
+
+ {
+ auto part = btr_search_sys.get_part(*index);
+ for (ulint i = 0; i < n_cached; i++) {
+ ha_insert_for_fold(&part->table, part->heap,
+ folds[i], block, recs[i]);
+ }
+ }
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
+exit_func:
+ assert_block_ahi_valid(block);
+ ahi_latch->wr_unlock();
+
+ ut_free(folds);
+ ut_free(recs);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/** Updates the search info after a cursor has been positioned, and
+builds a hash index on the cursor's page if that is recommended.
+@param[in,out] info search info
+@param[in,out] cursor cursor which was just positioned */
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor)
+{
+  srw_spin_lock *const latch =
+    &btr_search_sys.get_part(*cursor->index())->latch;
+  buf_block_t *const cur_block = btr_cur_get_block(cursor);
+
+  /* NOTE that the following two function calls do NOT protect
+  info or block->n_fields etc. with any semaphore, to save CPU time!
+  We cannot assume the fields are consistent when we return from
+  those functions! */
+
+  btr_search_info_update_hash(info, cursor);
+
+  const bool want_build =
+    btr_search_update_block_hash_info(info, cur_block);
+  const bool hash_failed = cursor->flag == BTR_CUR_HASH_FAIL;
+
+  if (want_build || hash_failed) {
+    btr_search_check_free_space_in_heap(cursor->index());
+  }
+
+  if (hash_failed) {
+    /* Update the hash node reference, if appropriate */
+
+#ifdef UNIV_SEARCH_PERF_STAT
+    btr_search_n_hash_fail++;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+    btr_search_update_hash_ref(info, cur_block, cursor);
+  }
+
+  if (!want_build) {
+    return;
+  }
+
+  /* Note that since we did not protect block->n_fields etc.
+  with any semaphore, the values can be inconsistent. We have
+  to check inside the function call that they make sense. */
+  btr_search_build_page_hash_index(cursor->index(), cur_block, latch,
+                                   cur_block->n_fields,
+                                   cur_block->n_bytes,
+                                   cur_block->left_side);
+}
+
+/** Move or delete hash entries for moved records, usually in a page split.
+If new_block is already hashed, then any hash index for block is dropped.
+If new_block is not hashed, and block is hashed, then a new hash index is
+built to new_block with the same parameters as block.
+If the index is marked as freed, the hash index for block is dropped.
+@param[in,out] new_block destination page
+@param[in,out] block source page (subject to deletion later) */
+void
+btr_search_move_or_delete_hash_entries(
+  buf_block_t* new_block,
+  buf_block_t* block)
+{
+  ut_ad(block->page.lock.have_x());
+  ut_ad(new_block->page.lock.have_x());
+
+  if (!btr_search_enabled) {
+    return;
+  }
+
+  /* Use the index of whichever block is hashed; if both are hashed,
+  they are asserted to refer to the same index. */
+  dict_index_t* index = block->index;
+  if (index) {
+    ut_ad(!new_block->index || index == new_block->index);
+  } else {
+    index = new_block->index;
+  }
+  assert_block_ahi_valid(block);
+  assert_block_ahi_valid(new_block);
+
+  srw_spin_lock* ahi_latch = nullptr;
+  if (index) {
+    ahi_latch = &btr_search_sys.get_part(*index)->latch;
+  }
+
+  if (new_block->index) {
+    /* The destination is already hashed: only drop the source. */
+    btr_search_drop_page_hash_index(block, false);
+    return;
+  }
+
+  if (!index) {
+    /* Neither block is hashed. */
+    return;
+  }
+
+  ahi_latch->rd_lock(SRW_LOCK_CALL);
+
+  if (index->freed()) {
+    ahi_latch->rd_unlock();
+    btr_search_drop_page_hash_index(block, false);
+    return;
+  }
+
+  if (!block->index) {
+    ahi_latch->rd_unlock();
+    return;
+  }
+
+  /* Copy the hash parameters of the source block while holding the
+  shared latch, then build the hash index on the destination. */
+  const uint16_t n_fields = block->curr_n_fields;
+  const uint16_t n_bytes = block->curr_n_bytes;
+  const bool left_side = block->curr_left_side;
+
+  new_block->n_fields = block->curr_n_fields;
+  new_block->n_bytes = block->curr_n_bytes;
+  new_block->left_side = left_side;
+
+  ahi_latch->rd_unlock();
+
+  ut_a(n_fields > 0 || n_bytes > 0);
+
+  btr_search_build_page_hash_index(index, new_block, ahi_latch,
+                                   n_fields, n_bytes, left_side);
+  ut_ad(n_fields == block->curr_n_fields);
+  ut_ad(n_bytes == block->curr_n_bytes);
+  ut_ad(left_side == block->curr_left_side);
+}
+
+/** Updates the page hash index when a single record is deleted from a page.
+If the page is hashed for an index object other than the cursor's, the
+whole hash index of the page is dropped instead.
+@param[in] cursor cursor which was positioned on the record to delete
+ using btr_cur_search_, the record is not yet deleted.*/
+void btr_search_update_hash_on_delete(btr_cur_t *cursor)
+{
+  ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
+
+  if (!btr_search_enabled) {
+    return;
+  }
+
+  buf_block_t *const block = btr_cur_get_block(cursor);
+
+  ut_ad(block->page.lock.have_x());
+
+  assert_block_ahi_valid(block);
+
+  dict_index_t *const index = block->index;
+  if (index == nullptr) {
+    return;
+  }
+
+  ut_ad(!cursor->index()->table->is_temporary());
+
+  if (cursor->index() != index) {
+    btr_search_drop_page_hash_index(block, false);
+    return;
+  }
+
+  ut_ad(block->page.id().space() == index->table->space_id);
+  ut_a(index == cursor->index());
+  ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0);
+  ut_ad(!dict_index_is_ibuf(index));
+
+  const rec_t *const rec = btr_cur_get_rec(cursor);
+
+  /* Compute the fold value of the record before acquiring the latch. */
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs_init(offsets_);
+  mem_heap_t *heap = NULL;
+
+  const rec_offs *const offsets =
+    rec_get_offsets(rec, index, offsets_, index->n_core_fields,
+                    ULINT_UNDEFINED, &heap);
+  const ulint fold = rec_fold(rec, offsets, block->curr_n_fields,
+                              block->curr_n_bytes, index->id);
+  if (UNIV_LIKELY_NULL(heap)) {
+    mem_heap_free(heap);
+  }
+
+  auto part = btr_search_sys.get_part(*index);
+
+  part->latch.wr_lock(SRW_LOCK_CALL);
+  assert_block_ahi_valid(block);
+
+  /* Check again under the exclusive latch that the block is still
+  hashed and the adaptive hash index is still enabled. */
+  if (block->index && btr_search_enabled) {
+    ut_a(block->index == index);
+
+    const bool removed = ha_search_and_delete_if_found(
+      &part->table, part->heap, fold, rec);
+
+    if (removed) {
+      MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+    } else {
+      MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
+    }
+
+    assert_block_ahi_valid(block);
+  }
+
+  part->latch.wr_unlock();
+}
+
+/** Updates the page hash index when a single record is inserted on a page.
+If the cursor was positioned by a hash search with the same parameters as
+the page hash index, and the page is hashed from the right side, only the
+existing hash node is updated; otherwise the work is delegated to
+btr_search_update_hash_on_insert().
+@param[in] cursor cursor which was positioned to the place to insert
+ using btr_cur_search_, and the new record has been
+ inserted next to the cursor.
+@param[in] ahi_latch the adaptive hash index latch */
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch)
+{
+ buf_block_t* block;
+ dict_index_t* index;
+ rec_t* rec;
+
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch);
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(block->page.lock.have_x());
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_ad(!cursor->index()->table->is_temporary());
+
+ /* The page is hashed for another index object than the cursor's
+ (asserted to have the same id): drop the page hash index. */
+ if (index != cursor->index()) {
+ ut_ad(index->id == cursor->index()->id);
+ btr_search_drop_page_hash_index(block, false);
+ return;
+ }
+
+ ut_a(cursor->index() == index);
+ ut_ad(!dict_index_is_ibuf(index));
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ /* Check again under the exclusive latch. Note that func_exit
+ is located inside the first branch of the if statement below. */
+ if (!block->index || !btr_search_enabled) {
+
+ goto func_exit;
+ }
+
+ ut_a(block->index == index);
+
+ if ((cursor->flag == BTR_CUR_HASH)
+ && (cursor->n_fields == block->curr_n_fields)
+ && (cursor->n_bytes == block->curr_n_bytes)
+ && !block->curr_left_side) {
+ /* Make the hash node found with cursor->fold that points to
+ rec point to the record that now follows rec instead. */
+ if (const rec_t *new_rec = page_rec_get_next_const(rec)) {
+ if (ha_search_and_update_if_found(
+ &btr_search_sys.get_part(*cursor->index())
+ ->table,
+ cursor->fold, rec, block, new_rec)) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+ }
+ } else {
+ ut_ad("corrupted page" == 0);
+ }
+
+func_exit:
+ assert_block_ahi_valid(block);
+ ahi_latch->wr_unlock();
+ } else {
+ /* Release the latch first; the callee acquires it
+ itself when needed. */
+ ahi_latch->wr_unlock();
+
+ btr_search_update_hash_on_insert(cursor, ahi_latch);
+ }
+}
+
+/** Updates the page hash index when a single record is inserted on a page.
+The fold values of the record preceding the inserted record (rec), the
+inserted record (ins_rec) and the record following it (next_rec) are
+compared, and hash nodes are inserted where the fold value changes.
+The exclusive latch is acquired lazily, only once an insertion into the
+hash table turns out to be necessary.
+@param[in,out] cursor cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor
+@param[in] ahi_latch the adaptive hash index latch */
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch)
+{
+ buf_block_t* block;
+ dict_index_t* index;
+ const rec_t* rec;
+ const rec_t* ins_rec;
+ const rec_t* next_rec;
+ ulint fold;
+ ulint ins_fold;
+ ulint next_fold = 0; /* remove warning (??? bug ???) */
+ ulint n_fields;
+ ulint n_bytes;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch);
+ ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(block->page.lock.have_x());
+ assert_block_ahi_valid(block);
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_ad(block->page.id().space() == index->table->space_id);
+ btr_search_check_free_space_in_heap(index);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!cursor->index()->table->is_temporary());
+
+ /* The page is hashed for another index object than the cursor's
+ (asserted to have the same id): drop the page hash index. The drop
+ label is also the exit taken when a next-record pointer is missing. */
+ if (index != cursor->index()) {
+ ut_ad(index->id == cursor->index()->id);
+drop:
+ btr_search_drop_page_hash_index(block, false);
+ return;
+ }
+
+ ut_a(index == cursor->index());
+ ut_ad(!dict_index_is_ibuf(index));
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ const bool left_side = block->curr_left_side;
+
+ ins_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!ins_rec)) goto drop;
+ next_rec = page_rec_get_next_const(ins_rec);
+ if (UNIV_UNLIKELY(!next_rec)) goto drop;
+
+ offsets = rec_get_offsets(ins_rec, index, offsets,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id);
+
+ /* next_fold is only computed (and later only used) when
+ next_rec is not the page supremum. */
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+ }
+
+ /* We must not look up "part" before acquiring ahi_latch. */
+ btr_search_sys_t::partition* part= nullptr;
+ bool locked = false;
+
+ if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) {
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+ } else {
+ /* rec is the page infimum or a metadata record, so it has
+ no fold value to compare with: with left_side, hash the
+ inserted record. */
+ if (left_side) {
+ locked = true;
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+ goto check_next_rec;
+ }
+
+ /* The fold value changes between rec and ins_rec. */
+ if (fold != ins_fold) {
+
+ if (!locked) {
+ locked = true;
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(&part->table, part->heap,
+ fold, block, rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+check_next_rec:
+ /* ins_rec is the last user record on the page: without
+ left_side, it is hashed. */
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+ if (!locked) {
+ locked = true;
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+ goto function_exit;
+ }
+
+ /* The fold value changes between ins_rec and next_rec. */
+ if (ins_fold != next_fold) {
+ if (!locked) {
+ locked = true;
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled || !block->index) {
+ goto function_exit;
+ }
+
+ part = btr_search_sys.get_part(*index);
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ next_fold, block, next_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ /* The latch is only held if one of the paths above acquired it. */
+ if (locked) {
+ ahi_latch->wr_unlock();
+ }
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+__attribute__((nonnull))
+/** Check that every node in a range of hash table cells is chained to
+the cell that its fold value maps to. An error is logged for each
+node that is in the wrong cell.
+@param table        hash table
+@param start_index  first cell to check
+@param end_index    last cell to check (inclusive)
+@return whether a range of the cells is valid */
+static bool ha_validate(const hash_table_t *table,
+                        ulint start_index, ulint end_index)
+{
+  ut_a(start_index <= end_index);
+  ut_a(end_index < table->n_cells);
+
+  bool valid= true;
+  ulint cell= start_index;
+
+  /* The range is not empty, because start_index <= end_index. */
+  do
+  {
+    const ha_node_t *node=
+      static_cast<const ha_node_t*>(table->array[cell].node);
+
+    while (node)
+    {
+      if (table->calc_hash(node->fold) != cell)
+      {
+        ib::error() << "Hash table node fold value " << node->fold
+                    << " does not match the cell number " << cell;
+        valid= false;
+      }
+
+      node= node->next;
+    }
+  }
+  while (cell++ != end_index);
+
+  return valid;
+}
+
+/** Validates the search system for given hash table.
+The first pass recomputes the fold value of every record that a hash node
+points to and compares it with the fold value stored in the node. The
+second pass invokes ha_validate() on the cells in chunks. Validation is
+abandoned (returning the result so far) if the adaptive hash index gets
+disabled or the connection is killed.
+@param thd connection, for checking if CHECK TABLE has been killed
+@param hash_table_id hash table to validate
+@return true if ok */
+static bool btr_search_hash_table_validate(THD *thd, ulint hash_table_id)
+{
+ ha_node_t* node;
+ bool ok = true;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ btr_search_x_lock_all();
+ if (!btr_search_enabled || (thd && thd_kill_level(thd))) {
+ /* Common exit: every jump to this label is made while holding
+ all search latches but not buf_pool.mutex. */
+func_exit:
+ btr_search_x_unlock_all();
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return ok;
+ }
+
+ /* How many cells to check before temporarily releasing
+ search latches. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ auto &part = btr_search_sys.parts[hash_table_id];
+
+ cell_count = part.table.n_cells;
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ /* The number of cells may have changed while the
+ latches were released; re-read it. */
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ node = static_cast<ha_node_t*>(part.table.array[i].node);
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_pool.block_from_ahi((byte*) node->data);
+ index_id_t page_index_id;
+
+ if (UNIV_LIKELY(block->page.in_file())) {
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ const page_id_t id(block->page.id());
+ if (const buf_page_t* hash_page
+ = buf_pool.page_hash.get(
+ id, buf_pool.page_hash.cell_get(
+ id.fold()))) {
+ ut_ad(hash_page == &block->page);
+ goto state_ok;
+ }
+ }
+
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first removes
+ the block from buf_pool.page_hash by calling
+ buf_LRU_block_remove_hashed_page(). Then it
+ invokes btr_search_drop_page_hash_index(). */
+ ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
+state_ok:
+ ut_ad(!dict_index_is_ibuf(block->index));
+ ut_ad(block->page.id().space()
+ == block->index->table->space_id);
+
+ const page_t* page = block->page.frame;
+
+ page_index_id = btr_page_get_index_id(page);
+
+ /* Recompute the fold value of the record with the
+ current hash parameters of the block. */
+ offsets = rec_get_offsets(
+ node->data, block->index, offsets,
+ block->index->n_core_fields,
+ btr_search_get_n_fields(block->curr_n_fields,
+ block->curr_n_bytes),
+ &heap);
+
+ const ulint fold = rec_fold(
+ node->data, offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id);
+
+ if (node->fold != fold) {
+ ok = FALSE;
+
+ ib::error() << "Error in an adaptive hash"
+ << " index pointer to page "
+ << block->page.id()
+ << ", ptr mem address "
+ << reinterpret_cast<const void*>(
+ node->data)
+ << ", index id " << page_index_id
+ << ", node fold " << node->fold
+ << ", rec fold " << fold;
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, node->data, offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %p,"
+ " n fields %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (void*) block->index,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_left_side);
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* Second pass: check chunk_size cells at a time that each node
+ is in the cell that its fold value maps to. */
+ for (i = 0; i < cell_count; i += chunk_size) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ /* Clamp the end of the chunk to the last cell. */
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ if (!ha_validate(&part.table, i, end_index)) {
+ ok = false;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto func_exit;
+}
+
+/** Validate the adaptive hash index, one btr_ahi_parts partition at a time.
+@param thd connection, for checking if CHECK TABLE has been killed
+@return true if every partition validated; false at the first failure */
+bool btr_search_validate(THD *thd)
+{
+  ulint part_no= 0;
+  while (part_no < btr_ahi_parts)
+    if (!btr_search_hash_table_validate(thd, part_no++))
+      return false;
+  return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check whether the index attached to a block is marked as freed.
+The check is made while holding the shared latch of the partition
+selected by the index id stored in the page frame and the tablespace
+id of the page.
+@param block buffer block to inspect
+@return whether block->index is set and block->index->freed() holds */
+bool btr_search_check_marked_free_index(const buf_block_t *block)
+{
+  auto partition= btr_search_sys.get_part(
+    btr_page_get_index_id(block->page.frame), block->page.id().space());
+
+  /* Read block->index only while the partition latch is held. */
+  partition->latch.rd_lock(SRW_LOCK_CALL);
+  const bool marked_free= block->index && block->index->freed();
+  partition->latch.rd_unlock();
+
+  return marked_free;
+}
+#endif /* UNIV_DEBUG */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */