author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/ibuf |
parent | Initial commit. (diff) |
download | mariadb-10.5-upstream.tar.xz mariadb-10.5-upstream.zip |
Adding upstream version 1:10.5.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
-rw-r--r-- | storage/innobase/ibuf/ibuf0ibuf.cc | 4811 |
1 file changed, 4811 insertions, 0 deletions
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
new file mode 100644
index 00000000..9288a496
--- /dev/null
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -0,0 +1,4811 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+#include "sync0sync.h"
+#include "btr0sea.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "row0upd.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "rem0cmp.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+   field. We store the information which affects the ordering of records, and
+   also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+   is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+   the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+   store the charset. In the compressed table format of 5.0.x we must add more
+   information here so that we can build a dummy 'index' struct which 5.0.x
+   can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
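The modulo rule above can be made concrete with a short standalone sketch (not part of this file). It assumes DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is 6 bytes per column, as in the InnoDB data-type headers, and uses IBUF_REC_INFO_SIZE = 4 as defined further down in this file:

    #include <cstdio>

    constexpr unsigned DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE = 6; /* assumed value */
    constexpr unsigned IBUF_REC_INFO_SIZE = 4; /* counter(2) + type(1) + flags(1) */

    /* Classify an ibuf record by the length of its type-info (fourth) field. */
    static const char* classify(unsigned field_len)
    {
        switch (field_len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
        case 0: return ">= 4.1.x insert, ROW_FORMAT=REDUNDANT";
        case 1: return ">= 5.0.3 insert, ROW_FORMAT=COMPACT marker byte";
        case IBUF_REC_INFO_SIZE: return ">= 5.5 record with counter/type/flags";
        default: return "corrupted";
        }
    }

    int main()
    {
        /* two user columns: 2 * 6 bytes of type info plus the 4 info bytes */
        std::printf("%s\n", classify(2 * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
                                     + IBUF_REC_INFO_SIZE));
        return 0;
    }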
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+3 mandatory fields, totaling 4 bytes:
+
+ 1. 2 bytes: Counter field, used to sort records within a (space id, page
+    no) in the order they were added. This is needed so that for example the
+    sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+    correctly.
+
+ 2. 1 byte: Operation type (see ibuf_op_t).
+
+ 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
+i/o-handlers for other tablespaces may require access to the insert buffer,
+but because all kinds of latches they need to access there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult.
+If there is an excess of pages in the free list of the ibuf tree, they
+might be needed if some thread reserves the fsp latch, intending to
+allocate more file space. So we do the following: if a thread reserves the
+fsp latch, we check the writer count field of the latch. If this field has
+value 1, it means that the thread did not own the latch before entering the
+fsp system, and the mtr of the thread contains no modifications to the fsp
+pages. Now we are free to reserve the ibuf latch, and check if there is an
+excess of pages in the free list. We can then, in a separate
+mini-transaction, take them out of the free list and free them to the fsp
+system.
+
+To avoid deadlocks in the ibuf system, we divide file pages into three levels:
+
+(1) non-ibuf pages,
+(2) ibuf tree pages and the pages in the ibuf tree free list, and
+(3) ibuf bitmap pages.
+
+No OS thread is allowed to access higher level pages if it has latches to
+lower level pages; even if the thread owns a B-tree latch it must not access
+the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
+is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
+exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
+level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
+
+/** Operations that can currently be buffered. */
+ulong innodb_change_buffering;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** The insert buffer control structure */
+ibuf_t ibuf;
+
+/** @name Offsets to the per-page bits in the insert buffer bitmap */
+/* @{ */
+#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
+ amount of free space */
+#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
+ changes for the page */
+#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
+ the ibuf tree, excluding the
+ root page, or is in the free
+ list of the ibuf */
+/* @} */
+
+#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format,
+ the page number. later, the space_id */
+#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker
+ consisting of 1 byte that is 0 */
+#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the
+ page number */
+#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */
+#define IBUF_REC_FIELD_USER 4 /* first user field */
+
+/* Various constants for checking the type of an ibuf record and extracting
+data from it. For details, see the description of the record format at the
+top of this file. */
+
+/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record
+The fourth column in the MySQL 5.5 format contains an operation
+type, counter, and some flags. */
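The three-level access rule above can be expressed as a small invariant check. This is a standalone sketch under the stated rule, not InnoDB code; it simplifies by treating every access as if its latch were retained:

    #include <cassert>

    enum page_level { NON_IBUF = 1, IBUF_TREE = 2, IBUF_BITMAP = 3 };

    struct thread_latches {
        int min_level_held = 99; /* sentinel: no latches held yet */

        /* A thread may not access a higher-level page while it holds a latch
           on a lower-level page. */
        void access(page_level level)
        {
            assert(level <= min_level_held);
            if (level < min_level_held) {
                min_level_held = level;
            }
        }
    };

    int main()
    {
        thread_latches t;
        t.access(IBUF_BITMAP); /* level 3 first */
        t.access(IBUF_TREE);   /* then level 2: allowed */
        t.access(NON_IBUF);    /* then level 1: allowed */
        /* t.access(IBUF_TREE); would assert: level 2 after holding level 1 */
        return 0;
    }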
+/* @{ */
+#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
+ the beginning of the fourth field */
+
+/* Offsets for the fields at the beginning of the fourth field */
+#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
+#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
+#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
+
+/* Record flag masks */
+#define IBUF_REC_COMPACT 0x1 /*!< Set in
+ IBUF_REC_OFFSET_FLAGS if the
+ user index is in COMPACT
+ format or later */
+
+
+/** The mutex used to block pessimistic inserts to ibuf trees */
+static ib_mutex_t ibuf_pessimistic_insert_mutex;
+
+/** The mutex protecting the insert buffer structs */
+static ib_mutex_t ibuf_mutex;
+
+/** The mutex protecting the insert buffer bitmaps */
+static ib_mutex_t ibuf_bitmap_mutex;
+
+/** The area in pages from which contract looks for page numbers for merge */
+const ulint IBUF_MERGE_AREA = 8;
+
+/** Inside the merge area, pages which have at most 1 per this number less
+buffered entries compared to the maximum volume that can be buffered for a
+single page are merged along with the page whose buffer became full */
+const ulint IBUF_MERGE_THRESHOLD = 4;
+
+/** In ibuf_contract at most this number of pages is read to memory in one
+batch, in order to merge the entries for them in the insert buffer */
+const ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA;
+
+/** If the combined size of the ibuf trees exceeds ibuf.max_size by
+this many pages, we start to contract it using synchronous contraction,
+and refuse to buffer new inserts */
+const ulint IBUF_CONTRACT_DO_NOT_INSERT = 10;
+
+/* TODO: how to cope with drop table if there are records in the insert
+buffer for the indexes of the table? Is there actually any problem,
+because ibuf merge is done to a page when it is read in, and it is
+still physically like the index page even if the index would have been
+dropped! So, there seems to be no problem. */
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+inside an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_enter(
+/*=======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(!mtr->is_inside_ibuf());
+ mtr->enter_ibuf();
+}
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+exiting an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_exit(
+/*======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->is_inside_ibuf());
+ mtr->exit_ibuf();
+}
+
+/**************************************************************//**
+Commits an insert buffer mini-transaction and sets the persistent
+cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */
+UNIV_INLINE
+void
+ibuf_btr_pcur_commit_specify_mtr(
+/*=============================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_d(ibuf_exit(mtr));
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+}
+
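The IBUF_REC_OFFSET_* layout just defined, together with the >= 5.5 record description earlier in this file, can be exercised in a standalone sketch. The names are illustrative, not InnoDB APIs; the operation-type values follow the insert/delete-mark/delete order used by ibuf_print_ops() later in this file:

    #include <cstdint>
    #include <cstdio>

    enum ibuf_op { IBUF_OP_INSERT = 0, IBUF_OP_DELETE_MARK = 1, IBUF_OP_DELETE = 2 };

    struct ibuf_rec_info {
        uint16_t counter; /* bytes 0-1, big-endian like all InnoDB integers */
        uint8_t  type;    /* byte 2, one of ibuf_op */
        uint8_t  flags;   /* byte 3, bit 0x1 = IBUF_REC_COMPACT */
    };

    /* Decode the 4 info bytes at the start of the metadata field. */
    static ibuf_rec_info decode_info(const uint8_t* p)
    {
        ibuf_rec_info info;
        info.counter = uint16_t((p[0] << 8) | p[1]); /* mach_read_from_2() */
        info.type    = p[2];
        info.flags   = p[3];
        return info;
    }

    int main()
    {
        const uint8_t bytes[4] = { 0x00, 0x2a, IBUF_OP_DELETE_MARK, 0x01 };
        const ibuf_rec_info i = decode_info(bytes);
        std::printf("counter=%u type=%u compact=%u\n",
                    i.counter, i.type, i.flags & 0x1);
        return 0;
    }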
+/******************************************************************//**
+Gets the ibuf header page and x-latches it.
+@return insert buffer header page */
+static
+page_t*
+ibuf_header_page_get(
+/*=================*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+
+ ut_ad(!ibuf_inside(mtr));
+ page_t* page = NULL;
+
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
+
+ if (block) {
+ buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
+ page = buf_block_get_frame(block);
+ }
+
+ return page;
+}
+
+/** Acquire the change buffer root page.
+@param[in,out] mtr mini-transaction
+@return change buffer root page, SX-latched */
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ mtr_sx_lock_index(ibuf.index, mtr);
+
+ /* only segment-list accesses need to exclude one another */
+ block = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
+ 0, RW_SX_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ ut_ad(page_get_space_id(block->frame) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(block->frame) == FSP_IBUF_TREE_ROOT_PAGE_NO);
+ ut_ad(ibuf.empty == page_is_empty(block->frame));
+
+ return block;
+}
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void)
+/*============*/
+{
+ if (!ibuf.index) {
+ return;
+ }
+
+ mutex_free(&ibuf_pessimistic_insert_mutex);
+
+ mutex_free(&ibuf_mutex);
+
+ mutex_free(&ibuf_bitmap_mutex);
+
+ dict_table_t* ibuf_table = ibuf.index->table;
+ rw_lock_free(&ibuf.index->lock);
+ dict_mem_index_free(ibuf.index);
+ dict_mem_table_free(ibuf_table);
+ ibuf.index = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+ const page_t* root) /*!< in: ibuf tree root */
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST);
+
+ ibuf.height = 1 + btr_page_get_level(root);
+
+ /* the '1 +' is the ibuf header page */
+ ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
+}
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+ page_t* root;
+ ulint n_used;
+
+ ut_ad(!ibuf.index);
+ mtr_t mtr;
+ mtr.start();
+ compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
+ compile_time_assert(IBUF_SPACE_ID == 0);
+ mtr_x_lock_space(fil_system.sys_space, &mtr);
+ buf_block_t* header_page = buf_page_get(
+ page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+ 0, RW_X_LATCH, &mtr);
+
+ if (!header_page) {
+ mtr.commit();
+ return DB_DECRYPTION_FAILED;
+ }
+
+ /* At startup we initialize ibuf to have a maximum of
+ CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+ buffer pool size. Once the ibuf struct is initialized this
+ value is updated with the user supplied size by calling
+ ibuf_max_size_update().
*/ + ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * CHANGE_BUFFER_DEFAULT_SIZE) / 100; + + mutex_create(LATCH_ID_IBUF, &ibuf_mutex); + + mutex_create(LATCH_ID_IBUF_BITMAP, &ibuf_bitmap_mutex); + + mutex_create(LATCH_ID_IBUF_PESSIMISTIC_INSERT, + &ibuf_pessimistic_insert_mutex); + + mutex_enter(&ibuf_mutex); + + fseg_n_reserved_pages(*header_page, + IBUF_HEADER + IBUF_TREE_SEG_HEADER + + header_page->frame, &n_used, &mtr); + + ut_ad(n_used >= 2); + + ibuf.seg_size = n_used; + + { + buf_block_t* block; + + block = buf_page_get( + page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO), + 0, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + root = buf_block_get_frame(block); + } + + ibuf_size_update(root); + mutex_exit(&ibuf_mutex); + + ibuf.empty = page_is_empty(root); + mtr.commit(); + + ibuf.index = dict_mem_index_create( + dict_mem_table_create("innodb_change_buffer", + fil_system.sys_space, 1, 0, 0, 0), + "CLUST_IND", + DICT_CLUSTERED | DICT_IBUF, 1); + ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; + ibuf.index->n_uniq = REC_MAX_N_FIELDS; + rw_lock_create(index_tree_rw_lock_key, &ibuf.index->lock, + SYNC_IBUF_INDEX_TREE); +#ifdef BTR_CUR_ADAPT + ibuf.index->search_info = btr_search_info_create(ibuf.index->heap); +#endif /* BTR_CUR_ADAPT */ + ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO; + ut_d(ibuf.index->cached = TRUE); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if (!ibuf_dump) { + return DB_SUCCESS; + } + ib::info() << "Dumping the change buffer"; + ibuf_mtr_start(&mtr); + btr_pcur_t pcur; + if (DB_SUCCESS == btr_pcur_open_at_index_side( + true, ibuf.index, BTR_SEARCH_LEAF, &pcur, + true, 0, &mtr)) { + while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + rec_print_old(stderr, btr_pcur_get_rec(&pcur)); + } + } + ibuf_mtr_commit(&mtr); + ib::info() << "Dumped the change buffer"; +#endif + + return DB_SUCCESS; +} + +/*********************************************************************//** +Updates the max_size value for ibuf. */ +void +ibuf_max_size_update( +/*=================*/ + ulint new_val) /*!< in: new value in terms of + percentage of the buffer pool size */ +{ + ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * new_val) / 100; + mutex_enter(&ibuf_mutex); + ibuf.max_size = new_size; + mutex_exit(&ibuf_mutex); +} + +# ifdef UNIV_DEBUG +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \ + MTR_MEMO_PAGE_X_FIX, mtr, bit) +# else /* UNIV_DEBUG */ +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit) +# endif /* UNIV_DEBUG */ + +/** Gets the desired bits for a given page from a bitmap page. 
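The sizing arithmetic above is simple enough to check by hand. A standalone sketch, assuming the default of CHANGE_BUFFER_DEFAULT_SIZE = 25 percent (an assumption; that constant is defined elsewhere in the server):

    #include <cstdio>

    int main()
    {
        /* ibuf.max_size: buffer pool size in pages, scaled by the percentage */
        const unsigned long long buf_pool_bytes = 128ULL << 20; /* 128 MiB pool */
        const unsigned page_size_shift = 14;                    /* 16 KiB pages */
        const unsigned pct = 25;            /* innodb_change_buffer_max_size */

        const unsigned long long pool_pages = buf_pool_bytes >> page_size_shift;
        const unsigned long long max_size = pool_pages * pct / 100; /* 2048 */
        std::printf("max_size = %llu pages\n", max_size);

        /* ibuf_size_update() accounting: the usable size excludes the header
           page and the pages parked on the free list */
        const unsigned long long seg_size = 130, free_list_len = 9;
        std::printf("size = %llu pages\n", seg_size - (1 + free_list_len));
        return 0;
    }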
+@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ... +@param[in,out] mtr mini-transaction holding latch_type on the +bitmap page +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@return value of bits */ +UNIV_INLINE +ulint +ibuf_bitmap_page_get_bits_low( + const page_t* page, + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + ulint latch_type, + mtr_t* mtr, +#endif /* UNIV_DEBUG */ + ulint bit) +{ + ulint byte_offset; + ulint bit_offset; + ulint map_byte; + ulint value; + const ulint size = zip_size ? zip_size : srv_page_size; + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(bit < IBUF_BITS_PER_PAGE); + compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); + ut_ad(mtr->memo_contains_page_flagged(page, latch_type)); + + bit_offset = (page_id.page_no() & (size - 1)) + * IBUF_BITS_PER_PAGE + bit; + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < srv_page_size); + + map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); + + value = ut_bit_get_nth(map_byte, bit_offset); + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + + value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1); + } + + return(value); +} + +/** Sets the desired bit for a given page in a bitmap page. +@tparam bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] block bitmap page +@param[in] page_id page id whose bits to set +@param[in] physical_size page size +@param[in] val value to set +@param[in,out] mtr mtr containing an x-latch to the bitmap page */ +template<ulint bit> +static void +ibuf_bitmap_page_set_bits( + buf_block_t* block, + const page_id_t page_id, + ulint physical_size, + ulint val, + mtr_t* mtr) +{ + ulint byte_offset; + ulint bit_offset; + + static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit"); + compile_time_assert(!(IBUF_BITS_PER_PAGE % 2)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(page_id.space())); + + bit_offset = (page_id.page_no() % physical_size) + * IBUF_BITS_PER_PAGE + bit; + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < srv_page_size); + + byte* map_byte = &block->frame[IBUF_BITMAP + byte_offset]; + byte b = *map_byte; + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + ut_ad(val <= 3); + b &= static_cast<byte>(~(3U << bit_offset)); + b |= static_cast<byte>(((val & 2) >> 1) << bit_offset + | (val & 1) << (bit_offset + 1)); + } else { + ut_ad(val <= 1); + b &= static_cast<byte>(~(1U << bit_offset)); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ +#endif + b |= static_cast<byte>(val << bit_offset); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + } + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b); +} + +/** Calculates the bitmap page number for a given page number. 
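A standalone sketch of the bit addressing used by the get/set functions above: every page is described by IBUF_BITS_PER_PAGE = 4 bits, and the 2-bit free field is read with its first bit as the high-order bit. The bitmap is taken as a bare byte array here, ignoring the IBUF_BITMAP offset within the page:

    #include <cstdint>
    #include <cstdio>

    constexpr unsigned IBUF_BITS_PER_PAGE = 4;

    /* Map (page_no, bit) to a byte and bit position inside the bitmap,
       mirroring the arithmetic in ibuf_bitmap_page_get_bits_low(). */
    static void locate(uint32_t page_no, uint32_t pages_per_bitmap, unsigned bit,
                       unsigned* byte_offset, unsigned* bit_offset)
    {
        const unsigned pos = (page_no % pages_per_bitmap) * IBUF_BITS_PER_PAGE + bit;
        *byte_offset = pos / 8;
        *bit_offset = pos % 8;
    }

    /* Read the 2-bit IBUF_BITMAP_FREE value for a page. */
    static unsigned get_free_bits(const uint8_t* bitmap, uint32_t page_no,
                                  uint32_t pages_per_bitmap)
    {
        unsigned byte_offset, bit_offset;
        locate(page_no, pages_per_bitmap, /* IBUF_BITMAP_FREE */ 0,
               &byte_offset, &bit_offset);
        const uint8_t b = bitmap[byte_offset];
        return ((b >> bit_offset) & 1) * 2 + ((b >> (bit_offset + 1)) & 1);
    }

    int main()
    {
        uint8_t bitmap[16] = {};
        bitmap[1] = 0x03; /* page 2: bits 8 and 9 set */
        std::printf("free bits = %u\n", get_free_bits(bitmap, 2, 16384));
        return 0;
    }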
+@param[in] page_id page id +@param[in] size page size +@return the bitmap page id where the file page is mapped */ +inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size) +{ + if (!size) + size= srv_page_size; + + return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET + + uint32_t(page_id.page_no() & ~(size - 1))); +} + +/** Gets the ibuf bitmap page where the bits describing a given file page are +stored. +@param[in] page_id page id of the file page +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] file file name +@param[in] line line where called +@param[in,out] mtr mini-transaction +@return bitmap page where the file page is mapped, that is, the bitmap +page containing the descriptor bits for the file page; the bitmap page +is x-latched */ +static +buf_block_t* +ibuf_bitmap_get_map_page_func( + const page_id_t page_id, + ulint zip_size, + const char* file, + unsigned line, + mtr_t* mtr) +{ + buf_block_t* block = buf_page_get_gen( + ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, + file, line, mtr); + + if (block) { + buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP); + } + + return block; +} + +/** Gets the ibuf bitmap page where the bits describing a given file page are +stored. +@param[in] page_id page id of the file page +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@return bitmap page where the file page is mapped, that is, the bitmap +page containing the descriptor bits for the file page; the bitmap page +is x-latched */ +#define ibuf_bitmap_get_map_page(page_id, zip_size, mtr) \ + ibuf_bitmap_get_map_page_func(page_id, zip_size, \ + __FILE__, __LINE__, mtr) + +/************************************************************************//** +Sets the free bits of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INLINE +void +ibuf_set_free_bits_low( +/*===================*/ + const buf_block_t* block, /*!< in: index page; free bits are set if + the index is non-clustered and page + level is 0 */ + ulint val, /*!< in: value to set: < 4 */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(mtr->is_named_space(block->page.id().space())); + if (!page_is_leaf(block->frame)) { + return; + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(val <= ibuf_index_page_calc_free(block)); +#endif /* UNIV_IBUF_DEBUG */ + const page_id_t id(block->page.id()); + + if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + id, block->zip_size(), mtr)) { + ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( + bitmap_page, id, block->physical_size(), + val, mtr); + } +} + +/************************************************************************//** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. 
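A usage sketch for the ibuf_bitmap_page_no_calc() mapping above, assuming FSP_IBUF_BITMAP_OFFSET = 1, i.e. that the bitmap page is the second page of each run of srv_page_size pages (treat that constant as an assumption):

    #include <cstdint>
    #include <cstdio>

    static uint32_t bitmap_page_no(uint32_t page_no, uint32_t size /* pages per run */)
    {
        const uint32_t FSP_IBUF_BITMAP_OFFSET = 1; /* assumed fsp layout constant */
        /* size must be a power of two, as the ut_is_2pow() assertion implies */
        return FSP_IBUF_BITMAP_OFFSET + (page_no & ~(size - 1));
    }

    int main()
    {
        /* with 16384 pages per run: pages 0..16383 share bitmap page 1,
           pages 16384..32767 share bitmap page 16385 */
        std::printf("%u %u\n", bitmap_page_no(100, 16384),
                    bitmap_page_no(20000, 16384));
        return 0;
    }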
*/ +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /*!< in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val) /*!< in: value to set: < 4 */ +{ + if (!page_is_leaf(block->frame)) { + return; + } + + mtr_t mtr; + mtr.start(); + const page_id_t id(block->page.id()); + + const fil_space_t* space = mtr.set_named_space_id(id.space()); + + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(id, + block->zip_size(), + &mtr); + + if (space->purpose != FIL_TYPE_TABLESPACE) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } + +#ifdef UNIV_IBUF_DEBUG + if (max_val != ULINT_UNDEFINED) { + ulint old_val; + + old_val = ibuf_bitmap_page_get_bits( + bitmap_page, id, + IBUF_BITMAP_FREE, &mtr); + ut_a(old_val <= max_val); + } + + ut_a(val <= ibuf_index_page_calc_free(block)); +#endif /* UNIV_IBUF_DEBUG */ + + ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( + bitmap_page, id, block->physical_size(), + val, &mtr); + + mtr.commit(); +} + +/************************************************************************//** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block) /*!< in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +{ + ibuf_set_free_bits(block, 0, ULINT_UNDEFINED); +} + +/**********************************************************************//** +Updates the free bits for an uncompressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + ulint max_ins_size, /*!< in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ulint before; + ulint after; + + ut_a(!is_buf_block_get_page_zip(block)); + ut_ad(mtr->is_named_space(block->page.id().space())); + + before = ibuf_index_page_calc_free_bits(srv_page_size, + max_ins_size); + + after = ibuf_index_page_calc_free(block); + + /* This approach cannot be used on compressed pages, since the + computed value of "before" often does not match the current + state of the bitmap. This is because the free space may + increase or decrease when a compressed page is reorganized. */ + if (before != after) { + ibuf_set_free_bits_low(block, after, mtr); + } +} + +/**********************************************************************//** +Updates the free bits for a compressed page to reflect the present +state. 
Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /*!< in/out: index page */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + ut_ad(page_is_leaf(block->frame)); + ut_ad(block->zip_size()); + + ulint after = ibuf_index_page_calc_free_zip(block); + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + block->page.id(), block->zip_size(), mtr)) { + + ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( + bitmap_page, block->page.id(), + block->physical_size(), after, mtr); + } +} + +/**********************************************************************//** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint state; + + ut_ad(mtr->is_named_space(block1->page.id().space())); + ut_ad(block1->page.id().space() == block2->page.id().space()); + + /* As we have to x-latch two random bitmap pages, we have to acquire + the bitmap mutex to prevent a deadlock with a similar operation + performed by another OS thread. */ + + mutex_enter(&ibuf_bitmap_mutex); + + state = ibuf_index_page_calc_free(block1); + + ibuf_set_free_bits_low(block1, state, mtr); + + state = ibuf_index_page_calc_free(block2); + + ibuf_set_free_bits_low(block2, state, mtr); + + mutex_exit(&ibuf_bitmap_mutex); +} + +/** Returns TRUE if the page is one of the fixed address ibuf pages. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a fixed address ibuf i/o page */ +inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size) +{ + return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO) + || ibuf_bitmap_page(page_id, zip_size)); +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch FALSE if relaxed check (avoid latching the +bitmap page) +@param[in] file file name +@param[in] line line where called +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. 
+@return TRUE if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + const char* file, + unsigned line, + mtr_t* mtr) +{ + ibool ret; + mtr_t local_mtr; + + ut_ad(!recv_no_ibuf_operations); + ut_ad(x_latch || mtr == NULL); + + if (ibuf_fixed_addr_page(page_id, zip_size)) { + return(true); + } else if (page_id.space() != IBUF_SPACE_ID) { + return(false); + } + + compile_time_assert(IBUF_SPACE_ID == 0); + ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE); + +#ifdef UNIV_DEBUG + if (!x_latch) { + mtr_start(&local_mtr); + + /* Get the bitmap page without a page latch, so that + we will not be violating the latching order when + another bitmap page has already been latched by this + thread. The page will be buffer-fixed, and thus it + cannot be removed or relocated while we are looking at + it. The contents of the page could change, but the + IBUF_BITMAP_IBUF bit that we are interested in should + not be modified by any other thread. Nobody should be + calling ibuf_add_free_page() or ibuf_remove_free_page() + while the page is linked to the insert buffer b-tree. */ + dberr_t err = DB_SUCCESS; + + buf_block_t* block = buf_page_get_gen( + ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, + file, line, &local_mtr, &err); + + ret = ibuf_bitmap_page_get_bits_low( + block->frame, page_id, zip_size, + MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); + + mtr_commit(&local_mtr); + return(ret); + } +#endif /* UNIV_DEBUG */ + + if (mtr == NULL) { + mtr = &local_mtr; + mtr_start(mtr); + } + + ret = ibuf_bitmap_page_get_bits(ibuf_bitmap_get_map_page_func( + page_id, zip_size, file, line, + mtr)->frame, + page_id, zip_size, + IBUF_BITMAP_IBUF, mtr); + + if (mtr == &local_mtr) { + mtr_commit(mtr); + } + + return(ret); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the page number field of an ibuf record. +@return page number */ +static +uint32_t +ibuf_rec_get_page_no_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. 
+@return space id */ +static +uint32_t +ibuf_rec_get_space_func( +/*====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(rec,op,comp,info_len,counter) +#endif +/****************************************************************//** +Get various information about an ibuf record in >= 4.1.x format. */ +static +void +ibuf_rec_get_info_func( +/*===================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: ibuf record */ + ibuf_op_t* op, /*!< out: operation type, or NULL */ + ibool* comp, /*!< out: compact flag, or NULL */ + ulint* info_len, /*!< out: length of info fields at the + start of the fourth field, or + NULL */ + ulint* counter) /*!< in: counter value, or NULL */ +{ + const byte* types; + ulint fields; + ulint len; + + /* Local variables to shadow arguments. */ + ibuf_op_t op_local; + ibool comp_local; + ulint info_len_local; + ulint counter_local; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + fields = rec_get_n_fields_old(rec); + ut_a(fields > IBUF_REC_FIELD_USER); + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + compile_time_assert(IBUF_REC_INFO_SIZE + < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + switch (info_len_local) { + case 0: + case 1: + op_local = IBUF_OP_INSERT; + comp_local = info_len_local; + ut_ad(!counter); + counter_local = ULINT_UNDEFINED; + break; + + case IBUF_REC_INFO_SIZE: + op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT; + counter_local = mach_read_from_2( + types + IBUF_REC_OFFSET_COUNTER); + break; + + default: + ut_error; + } + + ut_a(op_local < IBUF_OP_COUNT); + ut_a((len - info_len_local) == + (fields - IBUF_REC_FIELD_USER) + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (op) { + *op = op_local; + } + + if (comp) { + *comp = comp_local; + } + + if (info_len) { + *info_len = info_len_local; + } + + if (counter) { + *counter = counter_local; + } +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec) +#endif + +/****************************************************************//** +Returns the operation type field of an ibuf record. 
+@return operation type */ +static +ibuf_op_t +ibuf_rec_get_op_type_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + return(IBUF_OP_INSERT); + } else { + ibuf_op_t op; + + ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL); + + return(op); + } +} + +/****************************************************************//** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it +can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* ptr; + ulint len; + + if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) { + + return(ULINT_UNDEFINED); + } + + ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + if (len >= 2) { + + return(mach_read_from_2(ptr)); + } else { + + return(ULINT_UNDEFINED); + } +} + + +/** + Add accumulated operation counts to a permanent array. + Both arrays must be of size IBUF_OP_COUNT. +*/ +static void ibuf_add_ops(Atomic_counter<ulint> *out, const ulint *in) +{ + for (auto i = 0; i < IBUF_OP_COUNT; i++) + out[i]+= in[i]; +} + + +/****************************************************************//** +Print operation counts. The array must be of size IBUF_OP_COUNT. */ +static +void +ibuf_print_ops( +/*===========*/ + const Atomic_counter<ulint>* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ +{ + static const char* op_names[] = { + "insert", + "delete mark", + "delete" + }; + ulint i; + + ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT); + + for (i = 0; i < IBUF_OP_COUNT; i++) { + fprintf(file, "%s " ULINTPF "%s", op_names[i], + ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + } + + putc('\n', file); +} + +/********************************************************************//** +Creates a dummy index for inserting a record to a non-clustered index. +@return dummy index */ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + ulint n, /*!< in: number of fields */ + ibool comp) /*!< in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + + table = dict_mem_table_create("IBUF_DUMMY", NULL, n, 0, + comp ? 
DICT_TF_COMPACT : 0, 0); + + index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n); + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + ut_d(index->is_dummy = true); + + return(index); +} +/********************************************************************//** +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*=====================*/ + dict_index_t* index, /*!< in: dummy index */ + const dtype_t* type, /*!< in: the data type of the column */ + ulint len) /*!< in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, NULL, NULL, + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type)); + dict_index_add_col(index, index->table, + dict_table_get_nth_col(index->table, i), len); +} +/********************************************************************//** +Deallocates a dummy index for inserting a record to a non-clustered index. */ +static +void +ibuf_dummy_index_free( +/*==================*/ + dict_index_t* index) /*!< in, own: dummy index */ +{ + dict_table_t* table = index->table; + + dict_mem_index_free(index); + dict_mem_table_free(table); +} + +#ifdef UNIV_DEBUG +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex) +#else /* UNIV_DEBUG */ +# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \ + ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex) +#endif + +/*********************************************************************//** +Builds the entry used to + +1) IBUF_OP_INSERT: insert into a non-clustered index + +2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to + activate + +3) IBUF_OP_DELETE: find the record we need to delete + +when we have the corresponding record in an ibuf index. + +NOTE that as we copy pointers to fields in ibuf_rec, the caller must +hold a latch to the ibuf_rec page as long as the entry is used! 
+ +@return own: entry to insert to a non-clustered index */ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec, /*!< in: record in an insert buffer */ + mem_heap_t* heap, /*!< in: heap where built */ + dict_index_t** pindex) /*!< out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + const byte* types; + const byte* data; + ulint len; + ulint info_len; + ulint i; + ulint comp; + dict_index_t* index; + + ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER); + + n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL); + + index = ibuf_dummy_index_create(n_fields, comp); + + len -= info_len; + types += info_len; + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old( + ibuf_rec, i + IBUF_REC_FIELD_USER, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field), len); + } + + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + + /* Prevent an ut_ad() failure in page_zip_write_rec() by + adding system columns to the dummy table pointed to by the + dummy secondary index. The insert buffer is only used for + secondary indexes, whose records never contain any system + columns, such as DB_TRX_ID. */ + ut_d(dict_table_add_system_columns(index->table, index->table->heap)); + + *pindex = index; + + return(tuple); +} + +/******************************************************************//** +Get the data size. +@return size of fields */ +UNIV_INLINE +ulint +ibuf_rec_get_size( +/*==============*/ + const rec_t* rec, /*!< in: ibuf record */ + const byte* types, /*!< in: fields */ + ulint n_fields, /*!< in: number of fields */ + ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ +{ + ulint i; + ulint field_offset; + ulint types_offset; + ulint size = 0; + + field_offset = IBUF_REC_FIELD_USER; + types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + for (i = 0; i < n_fields; i++) { + ulint len; + dtype_t dtype; + + rec_get_nth_field_offs_old(rec, i + field_offset, &len); + + if (len != UNIV_SQL_NULL) { + size += len; + } else { + dtype_new_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } + + types += types_offset; + } + + return(size); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec) +#endif + +/********************************************************************//** +Returns the space taken by a stored non-clustered index entry if converted to +an index record. 
+@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_rec_get_volume_func( +/*=====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec)/*!< in: ibuf record */ +{ + ulint len; + const byte* data; + const byte* types; + ulint n_fields; + ulint data_size; + ulint comp; + ibuf_op_t op; + ulint info_len; + + ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + ut_a(len == 1); + ut_a(*data == 0); + + types = rec_get_nth_field_old( + ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL); + + if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) { + /* Delete-marking a record doesn't take any + additional space, and while deleting a record + actually frees up space, we have to play it safe and + pretend it takes no additional space (the record + might not exist, etc.). */ + + return(0); + } else if (comp) { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec, + heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } + + types += info_len; + n_fields = rec_get_n_fields_old(ibuf_rec) + - IBUF_REC_FIELD_USER; + + data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp); + + return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + + page_dir_calc_reserved_space(1)); +} + +/*********************************************************************//** +Builds the tuple to insert to an ibuf tree when we have an entry for a +non-clustered index. + +NOTE that the original entry must be kept because we copy pointers to +its fields. + +@return own: entry to insert into an ibuf index tree */ +static +dtuple_t* +ibuf_entry_build( +/*=============*/ + ibuf_op_t op, /*!< in: operation type */ + dict_index_t* index, /*!< in: non-clustered index */ + const dtuple_t* entry, /*!< in: entry for a non-clustered index */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number where entry should + be inserted */ + ulint counter,/*!< in: counter value; + ULINT_UNDEFINED=not used */ + mem_heap_t* heap) /*!< in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + const dfield_t* entry_field; + ulint n_fields; + byte* buf; + byte* ti; + byte* type_info; + ulint i; + + ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT); + ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF); + ut_ad(op < IBUF_OP_COUNT); + + /* We have to build a tuple with the following fields: + + 1-4) These are described at the top of this file. + + 5) The rest of the fields are copied from the entry. + + All fields in the tuple are ordered like the type binary in our + insert buffer tree. 
*/ + + n_fields = dtuple_get_n_fields(entry); + + tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER); + + /* 1) Space Id */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* 2) Marker byte */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 1)); + + /* We set the marker byte zero */ + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* 3) Page number */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + /* 4) Type info, part #1 */ + + if (counter == ULINT_UNDEFINED) { + i = dict_table_is_comp(index->table) ? 1 : 0; + } else { + ut_ad(counter <= 0xFFFF); + i = IBUF_REC_INFO_SIZE; + } + + ti = type_info = static_cast<byte*>( + mem_heap_alloc( + heap, + i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE)); + + switch (i) { + default: + ut_error; + break; + case 1: + /* set the flag for ROW_FORMAT=COMPACT */ + *ti++ = 0; + /* fall through */ + case 0: + /* the old format does not allow delete buffering */ + ut_ad(op == IBUF_OP_INSERT); + break; + case IBUF_REC_INFO_SIZE: + mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter); + + ti[IBUF_REC_OFFSET_TYPE] = (byte) op; + ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table) + ? IBUF_REC_COMPACT : 0; + ti += IBUF_REC_INFO_SIZE; + break; + } + + /* 5+) Fields from the entry */ + + for (i = 0; i < n_fields; i++) { + ulint fixed_len; + const dict_field_t* ifield; + + field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER); + entry_field = dtuple_get_nth_field(entry, i); + dfield_copy(field, entry_field); + + ifield = dict_index_get_nth_field(index, i); + /* Prefix index columns of fixed-length columns are of + fixed length. However, in the function call below, + dfield_get_type(entry_field) contains the fixed length + of the column in the clustered index. Replace it with + the fixed length of the secondary index column. */ + fixed_len = ifield->fixed_len; + +#ifdef UNIV_DEBUG + if (fixed_len) { + /* dict_index_add_col() should guarantee these */ + ut_ad(fixed_len <= (ulint) + dfield_get_type(entry_field)->len); + if (ifield->prefix_len) { + ut_ad(ifield->prefix_len == fixed_len); + } else { + ut_ad(fixed_len == (ulint) + dfield_get_type(entry_field)->len); + } + } +#endif /* UNIV_DEBUG */ + + dtype_new_store_for_order_and_null_size( + ti, dfield_get_type(entry_field), fixed_len); + ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + } + + /* 4) Type info, part #2 */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA); + + dfield_set_data(field, type_info, ulint(ti - type_info)); + + /* Set all the types in the new tuple binary */ + + dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER); + + return(tuple); +} + +/*********************************************************************//** +Builds a search tuple used to search buffered inserts for an index page. +This is for >= 4.1.x format records. 
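Both ibuf_entry_build() above and ibuf_search_tuple_build() below lay down the same key prefix: a 4-byte big-endian space id, a single zero marker byte, and a 4-byte big-endian page number. A standalone sketch of that 9-byte prefix (illustrative helper, not an InnoDB API):

    #include <cstdint>
    #include <cstdio>

    /* Equivalent of mach_write_to_4(): big-endian, like all InnoDB integers. */
    static void write_be4(uint8_t* b, uint32_t v)
    {
        b[0] = uint8_t(v >> 24);
        b[1] = uint8_t(v >> 16);
        b[2] = uint8_t(v >> 8);
        b[3] = uint8_t(v);
    }

    int main()
    {
        uint8_t key[9];
        write_be4(key, 5);      /* IBUF_REC_FIELD_SPACE: space id */
        key[4] = 0;             /* IBUF_REC_FIELD_MARKER: >= 4.1.x marker byte */
        write_be4(key + 5, 42); /* IBUF_REC_FIELD_PAGE: page number */

        for (unsigned i = 0; i < sizeof key; i++) {
            std::printf("%02x ", key[i]);
        }
        std::printf("\n");
        return 0;
    }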
+@return own: search tuple */ +static +dtuple_t* +ibuf_search_tuple_build( +/*====================*/ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: index page number */ + mem_heap_t* heap) /*!< in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the new format record marker byte */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 1)); + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA); + + return(tuple); +} + +/*********************************************************************//** +Checks if there are enough pages in the free list of the ibuf tree that we +dare to start a pessimistic insert to the insert buffer. +@return whether enough free pages in list */ +static inline bool ibuf_data_enough_free_for_insert() +{ + ut_ad(mutex_own(&ibuf_mutex)); + + /* We want a big margin of free pages, because a B-tree can sometimes + grow in size also if records are deleted from it, as the node pointers + can change, and we must make sure that we are able to delete the + inserts buffered for pages that we read to the buffer pool, without + any risk of running out of free space in the insert buffer. */ + + return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height); +} + +/*********************************************************************//** +Checks if there are enough pages in the free list of the ibuf tree that we +should remove them and free to the file space management. +@return TRUE if enough free pages in list */ +UNIV_INLINE +ibool +ibuf_data_too_much_free(void) +/*=========================*/ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height); +} + +/** Allocate a change buffer page. +@retval true on success +@retval false if no space left */ +static bool ibuf_add_free_page() +{ + mtr_t mtr; + page_t* header_page; + buf_block_t* block; + + mtr.start(); + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + mtr_x_lock_space(fil_system.sys_space, &mtr); + header_page = ibuf_header_page_get(&mtr); + + /* Allocate a new page: NOTE that if the page has been a part of a + non-clustered index which has subsequently been dropped, then the + page may have buffered inserts in the insert buffer, and these + should be deleted from there. These get deleted when the page + allocation creates the page in buffer. Thus the call below may end + up calling the insert buffer routines and, as we yet have no latches + to insert buffer tree pages, these routines can run without a risk + of a deadlock. This is the reason why we created a special ibuf + header page apart from the ibuf tree. 
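The two free-list heuristics above differ only by a margin of three pages, which keeps the free list oscillating in a narrow band rather than thrashing between growing and shrinking. A standalone sketch of the formulas:

    #include <cstdio>

    /* ibuf_data_enough_free_for_insert(): dare a pessimistic insert */
    static bool enough_free(unsigned long free_list_len, unsigned long size,
                            unsigned long height)
    {
        return free_list_len >= (size / 2) + 3 * height;
    }

    /* ibuf_data_too_much_free(): give pages back to the fsp system */
    static bool too_much_free(unsigned long free_list_len, unsigned long size,
                              unsigned long height)
    {
        return free_list_len >= 3 + (size / 2) + 3 * height;
    }

    int main()
    {
        /* size = 100 used pages, tree height 3: thresholds are 59 and 62 */
        std::printf("%d %d\n", enough_free(59, 100, 3), too_much_free(61, 100, 3));
        return 0;
    }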
*/ + + block = fseg_alloc_free_page( + header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP, + &mtr); + + if (block == NULL) { + mtr.commit(); + return false; + } + + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + ibuf_enter(&mtr); + mutex_enter(&ibuf_mutex); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); + + mtr.write<2>(*block, block->frame + FIL_PAGE_TYPE, + FIL_PAGE_IBUF_FREE_LIST); + + /* Add the page to the free list and update the ibuf size data */ + + flst_add_last(ibuf_tree_root_get(&mtr), + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + ibuf.seg_size++; + ibuf.free_list_len++; + + /* Set the bit indicating that this page is now an ibuf tree page + (level 2 page) */ + + const page_id_t page_id(block->page.id()); + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id, + srv_page_size, true, + &mtr); + + ibuf_mtr_commit(&mtr); + + return true; +} + +/*********************************************************************//** +Removes a page from the free list and frees it to the fsp system. */ +static +void +ibuf_remove_free_page(void) +/*=======================*/ +{ + mtr_t mtr; + mtr_t mtr2; + page_t* header_page; + + log_free_check(); + + mtr_start(&mtr); + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + + mtr_x_lock_space(fil_system.sys_space, &mtr); + header_page = ibuf_header_page_get(&mtr); + + /* Prevent pessimistic inserts to insert buffer trees for a while */ + ibuf_enter(&mtr); + mutex_enter(&ibuf_pessimistic_insert_mutex); + mutex_enter(&ibuf_mutex); + + if (!ibuf_data_too_much_free()) { + + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + + ibuf_mtr_commit(&mtr); + + return; + } + + ibuf_mtr_start(&mtr2); + + buf_block_t* root = ibuf_tree_root_get(&mtr2); + + mutex_exit(&ibuf_mutex); + + uint32_t page_no = flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST + + root->frame).page; + + /* NOTE that we must release the latch on the ibuf tree root + because in fseg_free_page we access level 1 pages, and the root + is a level 2 page. */ + + ibuf_mtr_commit(&mtr2); + ibuf_exit(&mtr); + + /* Since pessimistic inserts were prevented, we know that the + page is still in the free list. NOTE that also deletes may take + pages from the free list, but they take them from the start, and + the free list was so long that they cannot have taken the last + page from it. 
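+	As a worked arithmetic example (values are hypothetical): if
+	ibuf.size == 100 pages and ibuf.height == 3, then
+	ibuf_data_too_much_free() above held only when
+	ibuf.free_list_len >= 3 + 100 / 2 + 3 * 3 == 62, so the few
+	deletes that can run concurrently cannot have consumed the
+	whole list.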
*/ + + compile_time_assert(IBUF_SPACE_ID == 0); + fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + fil_system.sys_space, page_no, &mtr); + + const page_id_t page_id(IBUF_SPACE_ID, page_no); + + ibuf_enter(&mtr); + + mutex_enter(&ibuf_mutex); + + root = ibuf_tree_root_get(&mtr); + + ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST + + root->frame).page); + + buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + /* Remove the page from the free list and update the ibuf size data */ + + flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + mutex_exit(&ibuf_pessimistic_insert_mutex); + + ibuf.seg_size--; + ibuf.free_list_len--; + + /* Set the bit indicating that this page is no more an ibuf tree page + (level 2 page) */ + + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>( + bitmap_page, page_id, srv_page_size, false, &mtr); + + buf_page_free(fil_system.sys_space, page_no, &mtr, __FILE__, __LINE__); + + ibuf_mtr_commit(&mtr); +} + +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. */ +void +ibuf_free_excess_pages(void) +/*========================*/ +{ + /* Free at most a few pages at a time, so that we do not delay the + requested service too much */ + + for (ulint i = 0; i < 4; i++) { + + ibool too_much_free; + + mutex_enter(&ibuf_mutex); + too_much_free = ibuf_data_too_much_free(); + mutex_exit(&ibuf_mutex); + + if (!too_much_free) { + return; + } + + ibuf_remove_free_page(); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,pages,n_stored) +#else /* UNIV_DEBUG */ +# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(contract,rec,ids,pages,n_stored) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Reads page numbers from a leaf in an ibuf tree. 
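+
+Only records that fall into one 'merge area', i.e. one run of
+IBUF_MERGE_AREA consecutive pages within a single tablespace, are
+collected. As a worked example (assuming IBUF_MERGE_AREA == 8):
+records buffered for pages 16..23 of one tablespace all satisfy
+page_no / 8 == 2 and can be picked up by the same scan, while a
+record for page 24 belongs to area 3 and ends the scan.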
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos_func(
+/*=========================*/
+	ibool		contract,/*!< in: TRUE if this function is called to
+			contract the tree, FALSE if this is called
+			when a single page becomes full and we look
+			if it pays to read also nearby pages */
+	const rec_t*	rec,	/*!< in: insert buffer record */
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction holding rec */
+#endif /* UNIV_DEBUG */
+	uint32_t*	space_ids,/*!< in/out: space id's of the pages */
+	uint32_t*	page_nos,/*!< in/out: buffer for at least
+			IBUF_MAX_N_PAGES_MERGED many page numbers;
+			the page numbers are in an ascending order */
+	ulint*		n_stored)/*!< out: number of page numbers stored to
+			page_nos in this function */
+{
+	uint32_t	prev_page_no;
+	uint32_t	prev_space_id;
+	uint32_t	first_page_no;
+	uint32_t	first_space_id;
+	uint32_t	rec_page_no;
+	uint32_t	rec_space_id;
+	ulint		sum_volumes;
+	ulint		volume_for_page;
+	ulint		rec_volume;
+	ulint		limit;
+	ulint		n_pages;
+
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+
+	*n_stored = 0;
+
+	limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
+		       buf_pool_get_curr_size() / 4);
+
+	if (page_rec_is_supremum(rec)) {
+
+		rec = page_rec_get_prev_const(rec);
+	}
+
+	if (page_rec_is_infimum(rec)) {
+
+		rec = page_rec_get_next_const(rec);
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		return(0);
+	}
+
+	first_page_no = ibuf_rec_get_page_no(mtr, rec);
+	first_space_id = ibuf_rec_get_space(mtr, rec);
+	n_pages = 0;
+	prev_page_no = 0;
+	prev_space_id = 0;
+
+	/* Go backwards from the first rec until we reach the border of the
+	'merge area', the start of the page, or the limit of storable pages */
+
+	while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+		rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+		rec_space_id = ibuf_rec_get_space(mtr, rec);
+
+		if (rec_space_id != first_space_id
+		    || (rec_page_no / IBUF_MERGE_AREA)
+		    != (first_page_no / IBUF_MERGE_AREA)) {
+
+			break;
+		}
+
+		if (rec_page_no != prev_page_no
+		    || rec_space_id != prev_space_id) {
+			n_pages++;
+		}
+
+		prev_page_no = rec_page_no;
+		prev_space_id = rec_space_id;
+
+		rec = page_rec_get_prev_const(rec);
+	}
+
+	rec = page_rec_get_next_const(rec);
+
+	/* At the loop start there is no prev page; we mark this with a pair
+	of space id, page no (0, 0) for which there can never be entries in
+	the insert buffer */
+
+	prev_page_no = 0;
+	prev_space_id = 0;
+	sum_volumes = 0;
+	volume_for_page = 0;
+
+	while (*n_stored < limit) {
+		if (page_rec_is_supremum(rec)) {
+			/* When no more records are available, mark this with
+			another 'impossible' pair of space id, page no */
+			rec_page_no = 1;
+			rec_space_id = 0;
+		} else {
+			rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+			rec_space_id = ibuf_rec_get_space(mtr, rec);
+			/* In the system tablespace the smallest
+			possible secondary index leaf page number is
+			bigger than FSP_DICT_HDR_PAGE_NO (7).
+			In all tablespaces, pages 0 and 1 are reserved
+			for the allocation bitmap and the change
+			buffer bitmap. In file-per-table tablespaces,
+			a file segment inode page will be created at
+			page 2 and the clustered index tree is created
+			at page 3. So for file-per-table tablespaces,
+			page 4 is the smallest possible secondary
+			index leaf page.
CREATE TABLESPACE also initially + uses pages 2 and 3 for the first created table, + but that table may be dropped, allowing page 2 + to be reused for a secondary index leaf page. + To keep this assertion simple, just + make sure the page is >= 2. */ + ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); +#endif + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + if (contract + || (prev_page_no == first_page_no + && prev_space_id == first_space_id) + || (volume_for_page + > ((IBUF_MERGE_THRESHOLD - 1) + * 4U << srv_page_size_shift + / IBUF_PAGE_SIZE_PER_FREE_SPACE) + / IBUF_MERGE_THRESHOLD)) { + + space_ids[*n_stored] = prev_space_id; + page_nos[*n_stored] = prev_page_no; + + (*n_stored)++; + + sum_volumes += volume_for_page; + } + + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { + + break; + } + + volume_for_page = 0; + } + + if (rec_page_no == 1 && rec_space_id == 0) { + /* Supremum record */ + + break; + } + + rec_volume = ibuf_rec_get_volume(mtr, rec); + + volume_for_page += rec_volume; + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_next_const(rec); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif +#if 0 + fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n", + *n_stored, sum_volumes); +#endif + return(sum_volumes); +} + +/*******************************************************************//** +Get the matching records for space id. +@return current rec or NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const rec_t* +ibuf_get_user_rec( +/*===============*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_next(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************//** +Reads page numbers for a space id from an ibuf tree. +@return a lower limit for the combined volume of records which will be +merged */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ulint +ibuf_get_merge_pages( +/*=================*/ + btr_pcur_t* pcur, /*!< in/out: cursor */ + uint32_t space, /*!< in: space for which to merge */ + ulint limit, /*!< in: max page numbers to read */ + uint32_t* pages, /*!< out: pages read */ + uint32_t* spaces, /*!< out: spaces read */ + ulint* n_pages,/*!< out: number of pages read */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + const rec_t* rec; + ulint volume = 0; + + *n_pages = 0; + + while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0 + && ibuf_rec_get_space(mtr, rec) == space + && *n_pages < limit) { + + uint32_t page_no = ibuf_rec_get_page_no(mtr, rec); + + if (*n_pages == 0 || pages[*n_pages - 1] != page_no) { + spaces[*n_pages] = space; + pages[*n_pages] = page_no; + ++*n_pages; + } + + volume += ibuf_rec_get_volume(mtr, rec); + + btr_pcur_move_to_next(pcur, mtr); + } + + return(volume); +} + +/** +Delete a change buffer record. 
+@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr); + +/** Merge the change buffer to some pages. */ +static void ibuf_read_merge_pages(const uint32_t* space_ids, + const uint32_t* page_nos, ulint n_stored) +{ +#ifndef DBUG_OFF + mem_heap_t* heap = mem_heap_create(512); + ulint dops[IBUF_OP_COUNT]; + memset(dops, 0, sizeof(dops)); +#endif + + for (ulint i = 0; i < n_stored; i++) { + const ulint space_id = space_ids[i]; + fil_space_t* s = fil_space_t::get(space_id); + if (!s) { +tablespace_deleted: + /* The tablespace was not found: remove all + entries for it */ + ibuf_delete_for_discarded_space(space_id); + while (i + 1 < n_stored + && space_ids[i + 1] == space_id) { + i++; + } + continue; + } + + const ulint zip_size = s->zip_size(), size = s->size; + s->release(); + mtr_t mtr; + + if (UNIV_LIKELY(page_nos[i] < size)) { + mtr.start(); + dberr_t err; + buf_page_get_gen(page_id_t(space_id, page_nos[i]), + zip_size, RW_X_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, &mtr, &err, true); + mtr.commit(); + if (err == DB_TABLESPACE_DELETED) { + goto tablespace_deleted; + } + } +#ifndef DBUG_OFF + DBUG_EXECUTE_IF("ibuf_merge_corruption", goto work_around;); + continue; + + /* The following code works around a hang when the + change buffer is corrupted, likely due to the race + condition in crash recovery that was fixed in + MDEV-24449. But, it also introduces corruption by + itself in the following scenario: + + (1) We merged buffered changes in buf_page_get_gen() + (2) We committed the mini-transaction + (3) Redo log and the page with the merged changes is written + (4) A write completion callback thread evicts the page. + (5) Other threads buffer changes for that page. + (6) We will wrongly discard those newly buffered changes below. + + This code will be available in debug builds, so that + users may try to fix a shutdown hang that occurs due + to a corrupted change buffer. */ + +work_around: + /* Prevent an infinite loop, by removing entries from + the change buffer also in the case the bitmap bits were + wrongly clear even though buffered changes exist. 
*/ + const dtuple_t* tuple = ibuf_search_tuple_build( + space_id, page_nos[i], heap); +loop: + btr_pcur_t pcur; + ibuf_mtr_start(&mtr); + btr_pcur_open(ibuf.index, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto done; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur); + if (ibuf_rec_get_space(&mtr, ibuf_rec) != space_id + || ibuf_rec_get_page_no(&mtr, ibuf_rec) + != page_nos[i]) { + break; + } + + dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++; + /* Delete the record from ibuf */ + if (ibuf_delete_rec(page_id_t(space_id, page_nos[i]), + &pcur, tuple, &mtr)) { + /* Deletion was pessimistic and mtr + was committed: we start from the + beginning again */ + ut_ad(mtr.has_committed()); + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + goto loop; + } + } +done: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + mem_heap_empty(heap); +#endif + } + +#ifndef DBUG_OFF + ibuf_add_ops(ibuf.n_discarded_ops, dops); + mem_heap_free(heap); +#endif +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +static +ulint +ibuf_merge_pages( +/*=============*/ + ulint* n_pages) /*!< out: number of pages to which merged */ +{ + mtr_t mtr; + btr_pcur_t pcur; + ulint sum_sizes; + uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED]; + uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED]; + + *n_pages = 0; + + ibuf_mtr_start(&mtr); + + /* Open a cursor to a randomly chosen leaf of the tree, at a random + position within the leaf */ + bool available; + + available = btr_pcur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF, + &pcur, &mtr); + /* No one should make this index unavailable when server is running */ + ut_a(available); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + if (page_is_empty(btr_pcur_get_page(&pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_ad(ibuf.empty); + ut_ad(btr_pcur_get_block(&pcur)->page.id() + == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO)); + + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + return(0); + } + + sum_sizes = ibuf_get_merge_page_nos(TRUE, + btr_pcur_get_rec(&pcur), &mtr, + space_ids, + page_nos, n_pages); + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + ibuf_read_merge_pages(space_ids, page_nos, *n_pages); + + return(sum_sizes + 1); +} + +/*********************************************************************//** +Contracts insert buffer trees by reading pages referring to space_id +to the buffer pool. +@returns number of pages merged.*/ +ulint +ibuf_merge_space( +/*=============*/ + ulint space) /*!< in: tablespace id to merge */ +{ + mtr_t mtr; + btr_pcur_t pcur; + mem_heap_t* heap = mem_heap_create(512); + dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap); + ulint n_pages = 0; + + ut_ad(space < SRV_SPACE_ID_UPPER_BOUND); + + ibuf_mtr_start(&mtr); + + /* Position the cursor on the first matching record. 
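+	The key was built by ibuf_search_tuple_build(space, 0), and no
+	secondary index leaf page can have page number 0, so the
+	PAGE_CUR_GE search below lands on the first entry buffered for
+	this tablespace, if any. For example (hypothetical contents):
+	with entries buffered for (space=5, page=47) and
+	(space=7, page=11), a search with space == 5 positions the
+	cursor on the (5, 47) entry.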
*/
+
+	btr_pcur_open(
+		ibuf.index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
+		&mtr);
+
+	mem_heap_free(heap);
+
+	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+
+	ulint		sum_sizes = 0;
+	uint32_t	pages[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	spaces[IBUF_MAX_N_PAGES_MERGED];
+
+	if (page_is_empty(btr_pcur_get_page(&pcur))) {
+		/* If a B-tree page is empty, it must be the root page
+		and the whole B-tree must be empty. InnoDB does not
+		allow empty B-tree pages other than the root. */
+		ut_ad(ibuf.empty);
+		ut_ad(btr_pcur_get_block(&pcur)->page.id()
+		      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+	} else {
+
+		sum_sizes = ibuf_get_merge_pages(
+			&pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
+			&pages[0], &spaces[0], &n_pages,
+			&mtr);
+		ib::info() << "Size of pages merged " << sum_sizes;
+	}
+
+	ibuf_mtr_commit(&mtr);
+
+	btr_pcur_close(&pcur);
+
+	if (n_pages > 0) {
+		ut_ad(n_pages <= UT_ARR_SIZE(pages));
+
+#ifdef UNIV_DEBUG
+		for (ulint i = 0; i < n_pages; ++i) {
+			ut_ad(spaces[i] == space);
+		}
+#endif /* UNIV_DEBUG */
+
+		ibuf_read_merge_pages(spaces, pages, n_pages);
+	}
+
+	return(n_pages);
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@param[out]	n_pages		number of pages merged
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+MY_ATTRIBUTE((warn_unused_result))
+static ulint ibuf_merge(ulint* n_pages)
+{
+	*n_pages = 0;
+
+	/* We perform a dirty read of ibuf.empty, without latching
+	the insert buffer root page. We trust this dirty read except
+	when a slow shutdown is being executed. During a slow
+	shutdown, the insert buffer merge must be completed. */
+
+	if (ibuf.empty && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+		return(0);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+	} else if (ibuf_debug) {
+		return(0);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+	} else {
+		return ibuf_merge_pages(n_pages);
+	}
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is empty */
+static ulint ibuf_contract()
+{
+	ulint n_pages;
+	return ibuf_merge_pages(&n_pages);
+}
+
+/** Contract the change buffer by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+ulint ibuf_merge_all()
+{
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+	if (ibuf_debug) {
+		return(0);
+	}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+	ulint	sum_bytes = 0;
+	ulint	n_pages = srv_io_capacity;
+
+	for (ulint sum_pages = 0; sum_pages < n_pages; ) {
+		ulint	n_pag2;
+		ulint	n_bytes = ibuf_merge(&n_pag2);
+
+		if (n_bytes == 0) {
+			break;
+		}
+
+		sum_bytes += n_bytes;
+		/* Count the merged pages, so that the loop honours the
+		srv_io_capacity budget instead of looping until the
+		change buffer is empty. */
+		sum_pages += n_pag2;
+	}
+
+	return sum_bytes;
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+	ulint	entry_size)	/*!< in: size of a record which was inserted
+				into an ibuf tree */
+{
+	/* Perform dirty reads of ibuf.size and ibuf.max_size, to
+	reduce ibuf_mutex contention. ibuf.max_size remains constant
+	after ibuf_init_at_db_start(), but ibuf.size should be
+	protected by ibuf_mutex.
Given that ibuf.size fits in a + machine word, this should be OK; at worst we are doing some + excessive ibuf_contract() or occasionally skipping a + ibuf_contract(). */ + if (ibuf.size < ibuf.max_size) { + return; + } + + /* Contract at least entry_size many bytes */ + ulint sum_sizes = 0; + ulint size; + + do { + size = ibuf_contract(); + sum_sizes += size; + } while (size > 0 && sum_sizes < entry_size); +} + +/** Determine if a change buffer record has been encountered already. +@param rec change buffer record in the MySQL 5.5 format +@param hash hash table of encountered records +@param size number of elements in hash +@retval true if a distinct record +@retval false if this may be duplicating an earlier record */ +static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash, + ulint size) +{ + ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER); + const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER); + const ulint len= rec_get_data_size_old(rec) - start; + const uint32_t fold= ut_crc32(rec + start, len); + hash+= (fold / (CHAR_BIT * sizeof *hash)) % size; + ulint bitmask= static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash))); + + if (*hash & bitmask) + return false; + + /* We have not seen this record yet. Remember it. */ + *hash|= bitmask; + return true; +} + +#ifdef UNIV_DEBUG +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs) +#else /* UNIV_DEBUG */ +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Update the estimate of the number of records on a page, and +get the space taken by merging the buffered record to the index page. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_get_volume_buffered_count_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint* hash, /*!< in/out: hash array */ + ulint size, /*!< in: number of elements in hash array */ + lint* n_recs) /*!< in/out: estimated number of records + on the page that rec points to */ +{ + ulint len; + ibuf_op_t ibuf_op; + const byte* types; + ulint n_fields; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + n_fields = rec_get_n_fields_old(rec); + ut_ad(n_fields > IBUF_REC_FIELD_USER); + n_fields -= IBUF_REC_FIELD_USER; + + rec_get_nth_field_offs_old(rec, 1, &len); + /* This function is only invoked when buffering new + operations. All pre-4.1 records should have been merged + when the database was started up. */ + ut_a(len == 1); + + if (rec_get_deleted_flag(rec, 0)) { + /* This record has been merged already, + but apparently the system crashed before + the change was discarded from the buffer. + Pretend that the record does not exist. */ + return(0); + } + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE), + IBUF_REC_INFO_SIZE)) { + default: + ut_error; + case 0: + /* This ROW_TYPE=REDUNDANT record does not include an + operation counter. 
Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + + len = ibuf_rec_get_size(rec, types, n_fields, 0); + + return(len + + rec_get_converted_extra_size(len, n_fields, 0) + + page_dir_calc_reserved_space(1)); + case 1: + /* This ROW_TYPE=COMPACT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + goto get_volume_comp; + + case IBUF_REC_INFO_SIZE: + ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + break; + } + + switch (ibuf_op) { + case IBUF_OP_INSERT: + /* Inserts can be done by updating a delete-marked record. + Because delete-mark and insert operations can be pointing to + the same records, we must not count duplicates. */ + case IBUF_OP_DELETE_MARK: + /* There must be a record to delete-mark. + See if this record has been already buffered. */ + if (n_recs && ibuf_get_volume_buffered_hash(rec, hash, size)) { + (*n_recs)++; + } + + if (ibuf_op == IBUF_OP_DELETE_MARK) { + /* Setting the delete-mark flag does not + affect the available space on the page. */ + return(0); + } + break; + case IBUF_OP_DELETE: + /* A record will be removed from the page. */ + if (n_recs) { + (*n_recs)--; + } + /* While deleting a record actually frees up space, + we have to play it safe and pretend that it takes no + additional space (the record might not exist, etc.). */ + return(0); + default: + ut_error; + } + + ut_ad(ibuf_op == IBUF_OP_INSERT); + +get_volume_comp: + { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, rec, heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } +} + +/*********************************************************************//** +Gets an upper limit for the combined size of entries buffered in the insert +buffer for a given page. 
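+
+The records around pcur on the current ibuf leaf page are scanned in
+both directions, continuing onto the immediately previous and next
+leaf pages when the run of matching records touches a page boundary.
+As a rough worked example (sizes are illustrative): three buffered
+IBUF_OP_INSERT records that would occupy 40, 60 and 80 bytes on the
+index page yield an estimate of 40 + 60 + 80 == 180 bytes plus one
+page directory slot reservation per record, while *n_recs is raised
+once for each distinct record.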
+@return upper limit for the volume of buffered inserts for the index +page, in bytes; srv_page_size, if the entries for the index page span +several pages in the insert buffer */ +static +ulint +ibuf_get_volume_buffered( +/*=====================*/ + const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an + insert buffer tree where we would insert an + entry for the index page whose number is + page_no, latch mode has to be BTR_MODIFY_PREV + or BTR_MODIFY_TREE */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: page number of an index page */ + lint* n_recs, /*!< in/out: minimum number of records on the + page after the buffered changes have been + applied, or NULL to disable the counting */ + mtr_t* mtr) /*!< in: mini-transaction of pcur */ +{ + ulint volume; + const rec_t* rec; + const page_t* page; + const page_t* prev_page; + const page_t* next_page; + /* bitmap of buffered recs */ + ulint hash_bitmap[128 / sizeof(ulint)]; + + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) + || (pcur->latch_mode == BTR_MODIFY_TREE)); + + /* Count the volume of inserts earlier in the alphabetical order than + pcur */ + + volume = 0; + + if (n_recs) { + memset(hash_bitmap, 0, sizeof hash_bitmap); + } + + rec = btr_pcur_get_rec(pcur); + page = page_align(rec); + ut_ad(page_validate(page, ibuf.index)); + + if (page_rec_is_supremum(rec)) { + rec = page_rec_get_prev_const(rec); + } + + uint32_t prev_page_no; + + for (; !page_rec_is_infimum(rec); + rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the previous page */ + + prev_page_no = btr_page_get_prev(page); + + if (prev_page_no == FIL_NULL) { + + goto count_later; + } + + { + buf_block_t* block; + + block = buf_page_get( + page_id_t(IBUF_SPACE_ID, prev_page_no), + 0, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + prev_page = buf_block_get_frame(block); + ut_ad(page_validate(prev_page, ibuf.index)); + } + +#ifdef UNIV_BTR_DEBUG + static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_supremum_rec(prev_page); + rec = page_rec_get_prev_const(rec); + + for (;; rec = page_rec_get_prev_const(rec)) { + ut_ad(page_align(rec) == prev_page); + + if (page_rec_is_infimum(rec)) { + + /* We cannot go to yet a previous page, because we + do not have the x-latch on it, and cannot acquire one + because of the latching order: we have to give up */ + + return(srv_page_size); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + +count_later: + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_supremum(rec)) { + rec = page_rec_get_next_const(rec); + } + + for (; !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the next page */ + + 
uint32_t next_page_no = btr_page_get_next(page); + + if (next_page_no == FIL_NULL) { + + return(volume); + } + + { + buf_block_t* block; + + block = buf_page_get( + page_id_t(IBUF_SPACE_ID, next_page_no), + 0, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + + next_page = buf_block_get_frame(block); + ut_ad(page_validate(next_page, ibuf.index)); + } + +#ifdef UNIV_BTR_DEBUG + static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + ut_a(!memcmp_aligned<4>(next_page + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_infimum_rec(next_page); + rec = page_rec_get_next_const(rec); + + for (;; rec = page_rec_get_next_const(rec)) { + ut_ad(page_align(rec) == next_page); + + if (page_rec_is_supremum(rec)) { + + /* We give up */ + + return(srv_page_size); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } +} + +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + ulint max_space_id; + const rec_t* rec; + const byte* field; + ulint len; + btr_pcur_t pcur; + mtr_t mtr; + + ut_a(!dict_table_is_comp(ibuf.index->table)); + + ibuf_mtr_start(&mtr); + + btr_pcur_open_at_index_side( + false, ibuf.index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + btr_pcur_move_to_prev(&pcur, &mtr); + + if (btr_pcur_is_before_first_on_page(&pcur)) { + /* The tree is empty */ + + max_space_id = 0; + } else { + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + max_space_id = mach_read_from_4(field); + } + + ibuf_mtr_commit(&mtr); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(mtr,rec,space,page_no) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(rec,space,page_no) +#endif +/****************************************************************//** +Helper function for ibuf_get_entry_counter_func. Checks if rec is for +(space, page_no), and if so, reads counter value from it and returns +that + 1. +@retval ULINT_UNDEFINED if the record does not contain any counter +@retval 0 if the record is not for (space, page_no) +@retval 1 + previous counter value, otherwise */ +static +ulint +ibuf_get_entry_counter_low_func( +/*============================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction of rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint space, /*!< in: space id */ + ulint page_no) /*!< in: page number */ +{ + ulint counter; + const byte* field; + ulint len; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + /* Check the tablespace identifier. 
*/ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + if (mach_read_from_4(field) != space) { + + return(0); + } + + /* Check the page offset. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != page_no) { + + return(0); + } + + /* Check if the record contains a counter field. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + default: + ut_error; + case 0: /* ROW_FORMAT=REDUNDANT */ + case 1: /* ROW_FORMAT=COMPACT */ + return(ULINT_UNDEFINED); + + case IBUF_REC_INFO_SIZE: + counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER); + ut_a(counter < 0xFFFF); + return(counter + 1); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf) +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Calculate the counter field for an entry based on the current +last record in ibuf for (space, page_no). +@return the counter field, or ULINT_UNDEFINED +if we should abort this insertion to ibuf */ +static +ulint +ibuf_get_entry_counter_func( +/*========================*/ + ulint space, /*!< in: space id of entry */ + ulint page_no, /*!< in: page number of entry */ + const rec_t* rec, /*!< in: the record preceding the + insertion point */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction */ +#endif /* UNIV_DEBUG */ + ibool only_leaf) /*!< in: TRUE if this is the only + leaf page that can contain entries + for (space,page_no), that is, there + was no exact match for (space,page_no) + in the node pointer */ +{ + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_validate(page_align(rec), ibuf.index)); + + if (page_rec_is_supremum(rec)) { + /* This is just for safety. The record should be a + page infimum or a user record. */ + ut_ad(0); + return(ULINT_UNDEFINED); + } else if (!page_rec_is_infimum(rec)) { + return(ibuf_get_entry_counter_low(mtr, rec, space, page_no)); + } else if (only_leaf || !page_has_prev(page_align(rec))) { + /* The parent node pointer did not contain the + searched for (space, page_no), which means that the + search ended on the correct page regardless of the + counter value, and since we're at the infimum record, + there are no existing records. */ + return(0); + } else { + /* We used to read the previous page here. It would + break the latching order, because the caller has + buffer-fixed an insert buffer bitmap page. */ + return(ULINT_UNDEFINED); + } +} + + +/** Translates the ibuf free bits to the free space on a page in bytes. +@param[in] physical_size page_size +@param[in] bits value for ibuf bitmap bits +@return maximum insert size after reorganize for the page */ +inline ulint +ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits) +{ + ut_ad(bits < 4); + ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + + if (bits == 3) { + bits = 4; + } + + return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE; +} + +/** Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. 
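+
+For IBUF_OP_INSERT, whether the entry fits on the index page is judged
+from the free-space bits in the change buffer bitmap, translated by
+ibuf_index_page_calc_free_from_bits() above. As a worked example for a
+16KiB page, assuming IBUF_PAGE_SIZE_PER_FREE_SPACE == 32: bits == 1
+promises 512 bytes, bits == 2 promises 1024 bytes, and bits == 3 is
+rounded up to 4, promising 2048 bytes of free space after reorganization.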
+@param[in] mode BTR_MODIFY_PREV or BTR_MODIFY_TREE +@param[in] op operation type +@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete +buffering +@param[in] entry index entry to insert +@param[in] entry_size rec_get_converted_size(index, entry) +@param[in,out] index index where to insert; must not be unique +or clustered +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return DB_SUCCESS, DB_STRONG_FAIL or other error */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +ibuf_insert_low( + ulint mode, + ibuf_op_t op, + ibool no_counter, + const dtuple_t* entry, + ulint entry_size, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + big_rec_t* dummy_big_rec; + btr_pcur_t pcur; + btr_cur_t* cursor; + dtuple_t* ibuf_entry; + mem_heap_t* offsets_heap = NULL; + mem_heap_t* heap; + rec_offs* offsets = NULL; + ulint buffered; + lint min_n_recs; + rec_t* ins_rec; + buf_block_t* bitmap_page; + buf_block_t* block; + page_t* root; + dberr_t err; + ibool do_merge; + uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED]; + uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint n_stored; + mtr_t mtr; + mtr_t bitmap_mtr; + + ut_a(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_ad(page_id.space() == index->table->space_id); + ut_a(op < IBUF_OP_COUNT); + + do_merge = FALSE; + + /* Perform dirty reads of ibuf.size and ibuf.max_size, to + reduce ibuf_mutex contention. Given that ibuf.max_size and + ibuf.size fit in a machine word, this should be OK; at worst + we are doing some excessive ibuf_contract() or occasionally + skipping an ibuf_contract(). */ + if (ibuf.max_size == 0) { + return(DB_STRONG_FAIL); + } + + if (ibuf.size >= ibuf.max_size + IBUF_CONTRACT_DO_NOT_INSERT) { + /* Insert buffer is now too big, contract it but do not try + to insert */ + + +#ifdef UNIV_IBUF_DEBUG + fputs("Ibuf too big\n", stderr); +#endif + ibuf_contract(); + + return(DB_STRONG_FAIL); + } + + heap = mem_heap_create(1024); + + /* Build the entry which contains the space id and the page number + as the first fields and the type information for other fields, and + which will be inserted to the insert buffer. Using a counter value + of 0xFFFF we find the last record for (space, page_no), from which + we can then read the counter value N and use N + 1 in the record we + insert. (We patch the ibuf_entry's counter field to the correct + value just before actually inserting the entry.) */ + + ibuf_entry = ibuf_entry_build( + op, index, entry, page_id.space(), page_id.page_no(), + no_counter ? ULINT_UNDEFINED : 0xFFFF, heap); + + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. 
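+	As a worked example of the counter scheme described above: if
+	the last record already buffered for (space, page_no) carries
+	counter 4, the PAGE_CUR_LE search on our key patched to 0xFFFF
+	positions the cursor on that record, ibuf_get_entry_counter()
+	returns 4 + 1 == 5, and the entry is inserted with counter 5.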
*/ + + if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) { + for (;;) { + mutex_enter(&ibuf_pessimistic_insert_mutex); + mutex_enter(&ibuf_mutex); + + if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) { + + break; + } + + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + + if (!ibuf_add_free_page()) { + + mem_heap_free(heap); + return(DB_STRONG_FAIL); + } + } + } + + ibuf_mtr_start(&mtr); + + btr_pcur_open(ibuf.index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + /* Find out the volume of already buffered inserts for the same index + page */ + min_n_recs = 0; + buffered = ibuf_get_volume_buffered(&pcur, + page_id.space(), + page_id.page_no(), + op == IBUF_OP_DELETE + ? &min_n_recs + : NULL, &mtr); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (op == IBUF_OP_DELETE + && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) { + /* The page could become empty after the record is + deleted, or the page has been read in to the buffer + pool. Refuse to buffer the operation. */ + + /* The buffer pool watch is needed for IBUF_OP_DELETE + because of latching order considerations. We can + check buf_pool_watch_occurred() only after latching + the insert buffer B-tree pages that contain buffered + changes for the page. We never buffer IBUF_OP_DELETE, + unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have + been previously buffered for the page. Because there + are buffered operations for the page, the insert + buffer B-tree page latches held by mtr will guarantee + that no changes for the user page will be merged + before mtr_commit(&mtr). We must not mtr_commit(&mtr) + until after the IBUF_OP_DELETE has been buffered. */ + +fail_exit: + if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) { + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + } + + err = DB_STRONG_FAIL; + goto func_exit; + } + + /* After this point, the page could still be loaded to the + buffer pool, but we do not have to care about it, since we are + holding a latch on the insert buffer leaf page that contains + buffered changes for (space, page_no). If the page enters the + buffer pool, buf_page_read_complete() for (space, page_no) will + have to acquire a latch on the same insert buffer leaf page, + which it cannot do until we have buffered the IBUF_OP_DELETE + and done mtr_commit(&mtr) to release the latch. */ + + ibuf_mtr_start(&bitmap_mtr); + + bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr); + + /* We check if the index page is suitable for buffered entries */ + + if (buf_pool.page_hash_contains(page_id)) { +commit_exit: + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } else { + lock_mutex_enter(); + const auto lock_exists = lock_sys.get_first(page_id); + lock_mutex_exit(); + if (lock_exists) { + goto commit_exit; + } + } + + if (op == IBUF_OP_INSERT) { + ulint bits = ibuf_bitmap_page_get_bits( + bitmap_page->frame, page_id, physical_size, + IBUF_BITMAP_FREE, &bitmap_mtr); + + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(physical_size, + bits)) { + /* Release the bitmap page latch early. */ + ibuf_mtr_commit(&bitmap_mtr); + + /* It may not fit */ + do_merge = TRUE; + + ibuf_get_merge_page_nos(FALSE, + btr_pcur_get_rec(&pcur), &mtr, + space_ids, + page_nos, &n_stored); + + goto fail_exit; + } + } + + if (!no_counter) { + /* Patch correct counter value to the entry to + insert. 
This can change the insert position, which can + result in the need to abort in some cases. */ + ulint counter = ibuf_get_entry_counter( + page_id.space(), page_id.page_no(), + btr_pcur_get_rec(&pcur), &mtr, + btr_pcur_get_btr_cur(&pcur)->low_match + < IBUF_REC_FIELD_METADATA); + dfield_t* field; + + if (counter == ULINT_UNDEFINED) { + goto commit_exit; + } + + field = dtuple_get_nth_field( + ibuf_entry, IBUF_REC_FIELD_METADATA); + mach_write_to_2( + (byte*) dfield_get_data(field) + + IBUF_REC_OFFSET_COUNTER, counter); + } + + /* Set the bitmap bit denoting that the insert buffer contains + buffered entries for this index page, if the bit is not set yet */ + index->set_modified(bitmap_mtr); + ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>( + bitmap_page, page_id, physical_size, true, &bitmap_mtr); + ibuf_mtr_commit(&bitmap_mtr); + + cursor = btr_pcur_get_btr_cur(&pcur); + + if (mode == BTR_MODIFY_PREV) { + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + + /* If this is the root page, update ibuf.empty. */ + if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) { + const page_t* root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf.empty = page_is_empty(root); + } + } else { + ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode) + == BTR_MODIFY_TREE); + + /* We acquire an sx-latch to the root page before the insert, + because a pessimistic insert releases the tree x-latch, + which would cause the sx-latching of the root after that to + break the latching order. */ + + root = ibuf_tree_root_get(&mtr)->frame; + + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + } + + mutex_exit(&ibuf_pessimistic_insert_mutex); + ibuf_size_update(root); + mutex_exit(&ibuf_mutex); + ibuf.empty = page_is_empty(root); + + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, NULL, + thr_get_trx(thr)->id, &mtr); + } + +func_exit: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + mem_heap_free(heap); + + if (err == DB_SUCCESS + && BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) { + ibuf_contract_after_insert(entry_size); + } + + if (do_merge) { +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif + ibuf_read_merge_pages(space_ids, page_nos, n_stored); + } + + return(err); +} + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. 
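+
+Which operation types may be buffered is controlled by the settable
+global variable innodb_change_buffering, examined in the switch below.
+For example (mapping the server option names, as understood here, to
+the internal values): "inserts" (IBUF_USE_INSERT) buffers only
+IBUF_OP_INSERT and selects the counter-less 5.0.3 record format, while
+"all" (IBUF_USE_ALL) permits buffering of inserts, delete-marks and
+purges.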
+@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + dberr_t err; + ulint entry_size; + ibool no_counter; + /* Read the settable global variable only once in + this function, so that we will have a consistent view of it. */ + ibuf_use_t use = ibuf_use_t(innodb_change_buffering); + DBUG_ENTER("ibuf_insert"); + + DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF, + op, page_id.space(), page_id.page_no())); + + ut_ad(dtuple_check_typed(entry)); + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + + ut_a(!dict_index_is_clust(index)); + ut_ad(!index->table->is_temporary()); + + no_counter = use <= IBUF_USE_INSERT; + + switch (op) { + case IBUF_OP_INSERT: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_DELETE: + case IBUF_USE_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + goto check_watch; + } + break; + case IBUF_OP_DELETE_MARK: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto check_watch; + } + break; + case IBUF_OP_DELETE: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto skip_watch; + } + break; + case IBUF_OP_COUNT: + break; + } + + /* unknown op or use */ + ut_error; + +check_watch: + /* If a thread attempts to buffer an insert on a page while a + purge is in progress on the same page, the purge must not be + buffered, because it could remove a record that was + re-inserted later. For simplicity, we block the buffering of + all operations on a page that has a purge pending. + + We do not check this in the IBUF_OP_DELETE case, because that + would always trigger the buffer pool watch during purge and + thus prevent the buffering of delete operations. We assume + that the issuer of IBUF_OP_DELETE has called + buf_pool_t::watch_set(). */ + + if (buf_pool.page_hash_contains<true>(page_id)) { + /* A buffer pool watch has been set or the + page has been read into the buffer pool. + Do not buffer the request. If a purge operation + is being buffered, have this request executed + directly on the page in the buffer pool after the + buffered entries for this page have been merged. 
*/ + DBUG_RETURN(false); + } + +skip_watch: + entry_size = rec_get_converted_size(index, entry, 0); + + if (entry_size + >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2) { + + DBUG_RETURN(false); + } + + err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, + entry, entry_size, + index, page_id, zip_size, thr); + if (err == DB_FAIL) { + err = ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT, + op, no_counter, entry, entry_size, + index, page_id, zip_size, thr); + } + + ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL + || err == DB_TOO_BIG_RECORD); + + DBUG_RETURN(err == DB_SUCCESS); +} + +/********************************************************************//** +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. +@return newly inserted record */ +static MY_ATTRIBUTE((nonnull)) +rec_t* +ibuf_insert_to_index_page_low( +/*==========================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered + entry should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr, /*!< in/out: mtr */ + page_cur_t* page_cur)/*!< in/out: cursor positioned on the record + after which to insert the buffered entry */ +{ + rec_t* rec; + DBUG_ENTER("ibuf_insert_to_index_page_low"); + + rec = page_cur_tuple_insert(page_cur, entry, index, + offsets, &heap, 0, mtr); + if (rec != NULL) { + DBUG_RETURN(rec); + } + + /* Page reorganization or recompression should already have + been attempted by page_cur_tuple_insert(). Besides, per + ibuf_index_page_calc_free_zip() the page should not have been + recompressed or reorganized. */ + ut_ad(!is_buf_block_get_page_zip(block)); + + /* If the record did not fit, reorganize */ + + btr_page_reorganize(page_cur, index, mtr); + + /* This time the record must fit */ + + rec = page_cur_tuple_insert(page_cur, entry, index, + offsets, &heap, 0, mtr); + if (rec != NULL) { + DBUG_RETURN(rec); + } + + ib::error() << "Insert buffer insert fails; page free " + << page_get_max_insert_size(block->frame, 1) + << ", dtuple size " + << rec_get_converted_size(index, entry, 0); + + fputs("InnoDB: Cannot insert index record ", stderr); + dtuple_print(stderr, entry); + fputs("\nInnoDB: The table where this index record belongs\n" + "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" + "InnoDB: that table.\n", stderr); + + if (buf_block_t *bitmap_page = ibuf_bitmap_get_map_page( + block->page.id(), block->zip_size(), mtr)) { + + ib::error() << "page " << block->page.id() << ", size " + << block->physical_size() << ", bitmap bits " + << ibuf_bitmap_page_get_bits(bitmap_page->frame, + block->page.id(), block->zip_size(), + IBUF_BITMAP_FREE, mtr); + } + + ib::error() << BUG_REPORT_MSG; + + ut_ad(0); + DBUG_RETURN(NULL); +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. 
*/ +static +void +ibuf_insert_to_index_page( +/*======================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered entry + should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + page_t* page = buf_block_get_frame(block); + rec_t* rec; + rec_offs* offsets; + mem_heap_t* heap; + + DBUG_ENTER("ibuf_insert_to_index_page"); + + DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF, + block->page.id().space(), + block->page.id().page_no())); + + ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); +#ifdef BTR_CUR_HASH_ADAPT + /* A change buffer merge must occur before users are granted + any access to the page. No adaptive hash index entries may + point to a freshly read page. */ + ut_ad(!block->index); + assert_block_ahi_empty(block); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(mtr->is_named_space(block->page.id().space())); + + if (UNIV_UNLIKELY(dict_table_is_comp(index->table) + != (ibool)!!page_is_comp(page))) { + ib::warn() << "Trying to insert a record from the insert" + " buffer to an index page but the 'compact' flag does" + " not match!"; + goto dump; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (page_rec_is_supremum(rec)) { + ib::warn() << "Trying to insert a record from the insert" + " buffer to an index page but the index page" + " is empty!"; + goto dump; + } + + if (!rec_n_fields_is_sane(index, rec, entry)) { + ib::warn() << "Trying to insert a record from the insert" + " buffer to an index page but the number of fields" + " does not match!"; + rec_print(stderr, rec, index); +dump: + dtuple_print(stderr, entry); + ut_ad(0); + + ib::warn() << "The table where this index record belongs" + " is now probably corrupt. Please run CHECK TABLE on" + " your tables. " << BUG_REPORT_MSG; + + DBUG_VOID_RETURN; + } + + low_match = page_cur_search(block, index, entry, &page_cur); + + heap = mem_heap_create( + sizeof(upd_t) + + REC_OFFS_HEADER_SIZE * sizeof(*offsets) + + dtuple_get_n_fields(entry) + * (sizeof(upd_field_t) + sizeof *offsets)); + + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { + upd_t* update; + + rec = page_cur_get_rec(&page_cur); + + /* This is based on + row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ + ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); + + offsets = rec_get_offsets(rec, index, NULL, index->n_fields, + ULINT_UNDEFINED, &heap); + update = row_upd_build_sec_rec_difference_binary( + rec, index, offsets, entry, heap); + + if (update->n_fields == 0) { + /* The records only differ in the delete-mark. + Clear the delete-mark, like we did before + Bug #56680 was fixed. */ + btr_rec_set_deleted<false>(block, rec, mtr); + goto updated_in_place; + } + + /* Copy the info bits. Clear the delete-mark. */ + update->info_bits = rec_get_info_bits(rec, page_is_comp(page)); + update->info_bits &= byte(~REC_INFO_DELETED_FLAG); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* We cannot invoke btr_cur_optimistic_update() here, + because we do not have a btr_cur_t or que_thr_t, + as the insert buffer merge occurs at a very low level. */ + if (!row_upd_changes_field_size_or_external(index, offsets, + update) + && (!page_zip || btr_cur_update_alloc_zip( + page_zip, &page_cur, index, offsets, + rec_offs_size(offsets), false, mtr))) { + /* This is the easy case. 
Do something similar + to btr_cur_update_in_place(). */ + rec = page_cur_get_rec(&page_cur); + btr_cur_upd_rec_in_place(rec, index, offsets, + update, block, mtr); + + DBUG_EXECUTE_IF( + "crash_after_log_ibuf_upd_inplace", + log_buffer_flush_to_disk(); + ib::info() << "Wrote log record for ibuf" + " update in place operation"; + DBUG_SUICIDE(); + ); + + goto updated_in_place; + } + + /* btr_cur_update_alloc_zip() may have changed this */ + rec = page_cur_get_rec(&page_cur); + + /* A collation may identify values that differ in + storage length. + Some examples (1 or 2 bytes): + utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I + utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S + utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + + latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S + + Examples of a character (3-byte UTF-8 sequence) + identified with 2 or 4 characters (1-byte UTF-8 sequences): + + utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO + utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN + */ + + /* Delete the different-length record, and insert the + buffered one. */ + + lock_rec_store_on_page_infimum(block, rec); + page_cur_delete_rec(&page_cur, index, offsets, mtr); + page_cur_move_to_prev(&page_cur); + rec = ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, + &page_cur); + + ut_ad(!cmp_dtuple_rec(entry, rec, offsets)); + lock_rec_restore_from_page_infimum(block, rec, block); + } else { + offsets = NULL; + ibuf_insert_to_index_page_low(entry, block, index, + &offsets, heap, mtr, + &page_cur); + } +updated_in_place: + mem_heap_free(heap); + + DBUG_VOID_RETURN; +} + +/****************************************************************//** +During merge, sets the delete mark on a record for a secondary index +entry. */ +static +void +ibuf_set_del_mark( +/*==============*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + low_match = page_cur_search(block, index, entry, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + rec_t* rec = page_cur_get_rec(&page_cur); + + /* Delete mark the old index record. According to a + comment in row_upd_sec_index_entry(), it can already + have been delete marked if a lock wait occurred in + row_ins_sec_index_entry() in a previous invocation of + row_upd_sec_index_entry(). */ + + if (UNIV_LIKELY + (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table)))) { + btr_rec_set_deleted<true>(block, rec, mtr); + } + } else { + const page_t* page + = page_cur_get_page(&page_cur); + const buf_block_t* block + = page_cur_get_block(&page_cur); + + ib::error() << "Unable to find a record to delete-mark"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, page_cur_get_rec(&page_cur), index); + + ib::error() << "page " << block->page.id() << " (" + << page_get_n_recs(page) << " records, index id " + << btr_page_get_index_id(page) << ")."; + + ib::error() << BUG_REPORT_MSG; + ut_ad(0); + } +} + +/****************************************************************//** +During merge, delete a record for a secondary index entry. 
*/ +static +void +ibuf_delete( +/*========*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in/out: mtr; must be committed + before latching any further pages */ +{ + page_cur_t page_cur; + ulint low_match; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!index->is_spatial()); + ut_ad(!index->is_clust()); + + low_match = page_cur_search(block, index, entry, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + rec_t* rec = page_cur_get_rec(&page_cur); + + /* TODO: the below should probably be a separate function, + it's a bastardized version of btr_cur_optimistic_delete. */ + + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint max_ins_size = 0; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + if (page_get_n_recs(page) <= 1 + || !(REC_INFO_DELETED_FLAG + & rec_get_info_bits(rec, page_is_comp(page)))) { + /* Refuse to purge the last record or a + record that has not been marked for deletion. */ + ib::error() << "Unable to purge a record"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\nspace " UINT32PF " offset " UINT32PF + " (%u records, index id %llu)\n" + "InnoDB: Submit a detailed bug report" + " to https://jira.mariadb.org/\n", + block->page.id().space(), + block->page.id().page_no(), + (unsigned) page_get_n_recs(page), + (ulonglong) btr_page_get_index_id(page)); + + ut_ad(0); + return; + } + + lock_update_delete(block, rec); + + if (!page_zip) { + max_ins_size + = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&page_cur, index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { + /* The record must have been purged already. */ + } +} + +/*********************************************************************//** +Restores insert buffer tree cursor position +@return whether the position was restored */ +static MY_ATTRIBUTE((nonnull)) +bool +ibuf_restore_pos( +/*=============*/ + const page_id_t page_id,/*!< in: page identifier */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor whose + position is to be restored */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mode == BTR_MODIFY_LEAF + || BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE); + + if (UNIV_LIKELY(btr_pcur_restore_position(mode, pcur, mtr))) { + return true; + } + + if (fil_space_t* s = fil_space_t::get(page_id.space())) { + ib::error() << "ibuf cursor restoration fails!" 
+ " ibuf record inserted to page " + << page_id + << " in file " << s->chain.start->name; + s->release(); + + ib::error() << BUG_REPORT_MSG; + + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); + + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); + } + + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + return false; +} + +/** +Delete a change buffer record. +@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr) +{ + ibool success; + page_t* root; + dberr_t err; + + ut_ad(ibuf_inside(mtr)); + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) + == page_id.space()); + + success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), + 0, mtr); + + if (success) { + if (page_is_empty(btr_pcur_get_page(pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + root = btr_pcur_get_page(pcur); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + /* ibuf.empty is protected by the root page latch. + Before the deletion, it had to be FALSE. */ + ut_ad(!ibuf.empty); + ibuf.empty = true; + } + + return(FALSE); + } + + /* We have to resort to a pessimistic delete from ibuf. + Delete-mark the record so that it will not be applied again, + in case the server crashes before the pessimistic delete is + made persistent. */ + btr_rec_set_deleted<true>(btr_pcur_get_block(pcur), + btr_pcur_get_rec(pcur), mtr); + + btr_pcur_store_position(pcur, mtr); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + + ibuf_mtr_start(mtr); + mutex_enter(&ibuf_mutex); + + if (!ibuf_restore_pos(page_id, search_tuple, + BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, + pcur, mtr)) { + + mutex_exit(&ibuf_mutex); + ut_ad(mtr->has_committed()); + goto func_exit; + } + + root = ibuf_tree_root_get(mtr)->frame; + + btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0, + false, mtr); + ut_a(err == DB_SUCCESS); + + ibuf_size_update(root); + mutex_exit(&ibuf_mutex); + + ibuf.empty = page_is_empty(root); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + +func_exit: + ut_ad(mtr->has_committed()); + btr_pcur_close(pcur); + + return(TRUE); +} + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size) +{ + ut_ad(!fsp_is_system_temporary(id.space())); + + const ulint physical_size = zip_size ? 
zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(id, physical_size) + || fsp_descr_page(id, physical_size)) { + return false; + } + + mtr_t mtr; + bool bitmap_bits = false; + + ibuf_mtr_start(&mtr); + if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + id, zip_size, &mtr)) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->frame, id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr) != 0; + } + ibuf_mtr_commit(&mtr); + return bitmap_bits; +} + +/** Reset the bits in the bitmap page for the given block and page id. +@param b X-latched secondary index page (nullptr to discard changes) +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param mtr mini-transaction */ +static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id, + ulint zip_size, mtr_t *mtr) +{ + buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr); + if (!bitmap) + return; + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + /* FIXME: update the bitmap byte only once! */ + ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(bitmap, page_id, + physical_size, false, mtr); + + if (b) + ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(bitmap, page_id, physical_size, + ibuf_index_page_calc_free(b), + mtr); +} + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. +@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id, + ulint zip_size) +{ + if (trx_sys_hdr_page(page_id)) { + return; + } + + btr_pcur_t pcur; +#ifdef UNIV_IBUF_DEBUG + ulint volume = 0; +#endif /* UNIV_IBUF_DEBUG */ + bool corruption_noticed = false; + mtr_t mtr; + + /* Counts for merged & discarded operations. */ + ulint mops[IBUF_OP_COUNT]; + ulint dops[IBUF_OP_COUNT]; + + ut_ad(!block || page_id == block->page.id()); + ut_ad(!block || block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(!block || block->page.status == buf_page_t::NORMAL); + ut_ad(!trx_sys_hdr_page(page_id)); + ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(page_id, physical_size) + || fsp_descr_page(page_id, physical_size)) { + return; + } + + fil_space_t* space = fil_space_t::get(page_id.space()); + + if (UNIV_UNLIKELY(!space)) { + block = NULL; + } else { + ulint bitmap_bits = 0; + + ibuf_mtr_start(&mtr); + + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + page_id, zip_size, &mtr); + + if (bitmap_page + && fil_page_get_type(bitmap_page->frame) + != FIL_PAGE_TYPE_ALLOCATED) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->frame, page_id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr); + } + + ibuf_mtr_commit(&mtr); + + if (bitmap_bits && fseg_page_is_free( + space, page_id.page_no())) { + ibuf_mtr_start(&mtr); + mtr.set_named_space(space); + ibuf_reset_bitmap(block, page_id, zip_size, &mtr); + ibuf_mtr_commit(&mtr); + bitmap_bits = 0; + } + + if (!bitmap_bits) { + /* No changes are buffered for this page. 
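+ The bit was either clear to begin with, or it
+ was reset above because the page had already
+ been freed in the tablespace.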
*/ + space->release(); + return; + } + } + + mem_heap_t* heap = mem_heap_create(512); + + const dtuple_t* search_tuple = ibuf_search_tuple_build( + page_id.space(), page_id.page_no(), heap); + + if (block != NULL) { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the insert operations to the index page to pass + the debug checks. */ + + rw_lock_x_lock_move_ownership(&(block->lock)); + + if (!fil_page_index_page_check(block->frame) + || !page_is_leaf(block->frame)) { + + corruption_noticed = true; + + ib::error() << "Corruption in the tablespace. Bitmap" + " shows insert buffer records to page " + << page_id << " though the page type is " + << fil_page_get_type(block->frame) + << ", which is not an index leaf page. We try" + " to resolve the problem by skipping the" + " insert buffer merge for this page. Please" + " run CHECK TABLE on your tables to determine" + " if they are corrupt after this."; + ut_ad(0); + } + } + + memset(mops, 0, sizeof(mops)); + memset(dops, 0, sizeof(dops)); + +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for this + index page */ + btr_pcur_open_on_user_rec( + ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (block) { + ut_ad(rw_lock_own(&block->lock, RW_LOCK_X)); + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + rw_lock_x_lock(&block->lock); + + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); + /* This is a user page (secondary index leaf page), + but we pretend that it is a change buffer page in + order to obey the latching order. This should be OK, + because buffered changes are applied immediately while + the block is io-fixed. Other threads must not try to + latch an io-fixed block. */ + buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE); + } + + if (space) { + mtr.set_named_space(space); + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto reset_bit; + } + + for (;;) { + rec_t* rec; + + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this index page */ + if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no() + || ibuf_rec_get_space(&mtr, rec) != page_id.space()) { + + if (block != NULL) { + page_header_reset_last_insert(block, &mtr); + } + + goto reset_bit; + } + + if (corruption_noticed) { + fputs("InnoDB: Discarding record\n ", stderr); + rec_print_old(stderr, rec); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); + } else if (block != NULL && !rec_get_deleted_flag(rec, 0)) { + /* Now we have at pcur a record which should be + applied on the index page; NOTE that the call below + copies pointers to fields in rec, and we must + keep the latch to the rec page until the + insertion is finished! 
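+ (ibuf_build_entry_from_ibuf_rec() makes the
+ dtuple point into the change buffer page rather
+ than copying the data.)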
*/ + dtuple_t* entry; + trx_id_t max_trx_id; + dict_index_t* dummy_index; + ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec); + + max_trx_id = page_get_max_trx_id(page_align(rec)); + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, &mtr); + + ut_ad(page_validate(page_align(rec), ibuf.index)); + + entry = ibuf_build_entry_from_ibuf_rec( + &mtr, rec, heap, &dummy_index); + ut_ad(!dummy_index->table->space); + dummy_index->table->space = space; + dummy_index->table->space_id = space->id; + + ut_ad(page_validate(block->frame, dummy_index)); + + switch (op) { + case IBUF_OP_INSERT: +#ifdef UNIV_IBUF_DEBUG + volume += rec_get_converted_size( + dummy_index, entry, 0); + + volume += page_dir_calc_reserved_space(1); + + ut_a(volume <= (4U << srv_page_size_shift) + / IBUF_PAGE_SIZE_PER_FREE_SPACE); +#endif + ibuf_insert_to_index_page( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE_MARK: + ibuf_set_del_mark( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE: + ibuf_delete(entry, block, dummy_index, &mtr); + /* Because ibuf_delete() will latch an + insert buffer bitmap page, commit mtr + before latching any further pages. + Store and restore the cursor position. */ + ut_ad(rec == btr_pcur_get_rec(&pcur)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(ibuf_rec_get_page_no(&mtr, rec) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(&mtr, rec) + == page_id.space()); + + /* Mark the change buffer record processed, + so that it will not be merged again in case + the server crashes between the following + mtr_commit() and the subsequent mtr_commit() + of deleting the change buffer record. */ + btr_rec_set_deleted<true>( + btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), &mtr); + + btr_pcur_store_position(&pcur, &mtr); + ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr); + + ibuf_mtr_start(&mtr); + mtr.set_named_space(space); + + ut_ad(rw_lock_own(&block->lock, RW_LOCK_X)); + buf_block_buf_fix_inc(block, + __FILE__, __LINE__); + rw_lock_x_lock(&block->lock); + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); + + /* This is a user page (secondary + index leaf page), but it should be OK + to use too low latching order for it, + as the block is io-fixed. */ + buf_block_dbg_add_level( + block, SYNC_IBUF_TREE_NODE); + + if (!ibuf_restore_pos(page_id, search_tuple, + BTR_MODIFY_LEAF, + &pcur, &mtr)) { + + ut_ad(mtr.has_committed()); + mops[op]++; + ibuf_dummy_index_free(dummy_index); + goto loop; + } + + break; + default: + ut_error; + } + + mops[op]++; + + ibuf_dummy_index_free(dummy_index); + } else { + dops[ibuf_rec_get_op_type(&mtr, rec)]++; + } + + /* Delete the record from ibuf */ + if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ut_ad(mtr.has_committed()); + goto loop; + } else if (btr_pcur_is_after_last_on_page(&pcur)) { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto loop; + } + } + +reset_bit: + if (space) { + ibuf_reset_bitmap(block, page_id, zip_size, &mtr); + } + + ibuf_mtr_commit(&mtr); + + if (space) { + space->release(); + } + + btr_pcur_close(&pcur); + mem_heap_free(heap); + + ibuf.n_merges++; + ibuf_add_ops(ibuf.n_merged_ops, mops); + ibuf_add_ops(ibuf.n_discarded_ops, dops); +} + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or crash recovery. 
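+The entries are discarded without being merged; they are only counted
+into ibuf.n_discarded_ops.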
+@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(ulint space) +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + const rec_t* ibuf_rec; + mtr_t mtr; + + /* Counts for discarded operations. */ + ulint dops[IBUF_OP_COUNT]; + + heap = mem_heap_create(512); + + /* Use page number 0 to build the search tuple so that we get the + cursor positioned at the first entry for this space id */ + + search_tuple = ibuf_search_tuple_build(space, 0, heap); + + memset(dops, 0, sizeof(dops)); +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for the + space */ + btr_pcur_open_on_user_rec( + ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto leave_loop; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + ibuf_rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this space */ + if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) { + + goto leave_loop; + } + + uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec); + + dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++; + + /* Delete the record from ibuf */ + if (ibuf_delete_rec(page_id_t(space, page_no), + &pcur, search_tuple, &mtr)) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ut_ad(mtr.has_committed()); + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto loop; + } + } + +leave_loop: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + + ibuf_add_ops(ibuf.n_discarded_ops, dops); + + mem_heap_free(heap); +} + +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +bool +ibuf_is_empty(void) +/*===============*/ +{ + mtr_t mtr; + + ibuf_mtr_start(&mtr); + + ut_d(mutex_enter(&ibuf_mutex)); + const buf_block_t* root = ibuf_tree_root_get(&mtr); + bool is_empty = page_is_empty(root->frame); + ut_a(is_empty == ibuf.empty); + ut_d(mutex_exit(&ibuf_mutex)); + ibuf_mtr_commit(&mtr); + + return(is_empty); +} + +/******************************************************************//** +Prints info of ibuf. */ +void +ibuf_print( +/*=======*/ + FILE* file) /*!< in: file where to print */ +{ + mutex_enter(&ibuf_mutex); + + fprintf(file, + "Ibuf: size " ULINTPF ", free list len " ULINTPF "," + " seg size " ULINTPF ", " ULINTPF " merges\n", + ibuf.size, + ibuf.free_list_len, + ibuf.seg_size, + ulint{ibuf.n_merges}); + + fputs("merged operations:\n ", file); + ibuf_print_ops(ibuf.n_merged_ops, file); + + fputs("discarded operations:\n ", file); + ibuf_print_ops(ibuf.n_discarded_ops, file); + + mutex_exit(&ibuf_mutex); +} + +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. +@param[in] trx transaction +@param[in,out] space tablespace being imported +@return DB_SUCCESS or error code */ +dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) +{ + ut_ad(trx->mysql_thd); + ut_ad(space->purpose == FIL_TYPE_IMPORT); + + const unsigned zip_size = space->zip_size(); + const unsigned physical_size = space->physical_size(); + + uint32_t size= std::min(space->free_limit, space->size); + + if (size == 0) { + return(DB_TABLE_NOT_FOUND); + } + + mtr_t mtr; + + mutex_enter(&ibuf_mutex); + + /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat + every page_size pages. 
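+ (Each page needs only IBUF_BITS_PER_PAGE = 4 bits in the
+ bitmap, so a single bitmap page has room to describe all of
+ them.)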
+ For example, if page_size is 16 KiB, the two bitmap pages repeat
+ every 16 KiB * 16384 = 256 MiB. In the loop below, page_no is
+ measured in pages from the beginning of the space, as usual. */
+
+ for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
+ if (trx_is_interrupted(trx)) {
+ mutex_exit(&ibuf_mutex);
+ return(DB_INTERRUPTED);
+ }
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ ibuf_enter(&mtr);
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ page_id_t(space->id, page_no), zip_size, &mtr);
+ if (!bitmap_page) {
+ mutex_exit(&ibuf_mutex);
+ mtr.commit();
+ return DB_CORRUPTION;
+ }
+
+ if (buf_is_zeroes(span<const byte>(bitmap_page->frame,
+ physical_size))) {
+ /* This means we got an all-zero page instead of
+ an ibuf bitmap page. The subsequent pages should
+ be all-zero as well. */
+#ifdef UNIV_DEBUG
+ for (uint32_t curr_page = page_no + 1;
+ curr_page < physical_size; curr_page++) {
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id, curr_page),
+ zip_size, RW_S_LATCH, &mtr);
+ page_t* page = buf_block_get_frame(block);
+ ut_ad(buf_is_zeroes(span<const byte>(
+ page,
+ physical_size)));
+ }
+#endif /* UNIV_DEBUG */
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+ continue;
+ }
+
+ for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
+ i++) {
+ const uint32_t offset = page_no + i;
+ const page_id_t cur_page_id(space->id, offset);
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, cur_page_id, zip_size,
+ IBUF_BITMAP_IBUF, &mtr)) {
+
+ mutex_exit(&ibuf_mutex);
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "File %s page %u"
+ " is wrongly flagged to belong to the"
+ " insert buffer",
+ space->chain.start->name, offset);
+ return(DB_CORRUPTION);
+ }
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->frame, cur_page_id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr)) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Buffered changes"
+ " for file %s page %u are lost",
+ space->chain.start->name, offset);
+
+ /* Tolerate this error, so that
+ slightly corrupted tables can be
+ imported and dumped. Clear the bit. */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, cur_page_id,
+ physical_size, false, &mtr);
+ }
+ }
+
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+ }
+
+ mutex_exit(&ibuf_mutex);
+ return(DB_SUCCESS);
+}
+
+/** Updates free bits and buffered bits for a bulk-loaded page.
+@param[in] block index page
+@param[in] reset whether to reset the free bits to zero */
+void
+ibuf_set_bitmap_for_bulk_load(
+ buf_block_t* block,
+ bool reset)
+{
+ mtr_t mtr;
+
+ ut_a(page_is_leaf(buf_block_get_frame(block)));
+
+ mtr.start();
+ fil_space_t* space = mtr.set_named_space_id(block->page.id().space());
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id(),
+ space->zip_size(),
+ &mtr);
+
+ const ulint free_val = reset ? 0 : ibuf_index_page_calc_free(block);
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+ bitmap_page, block->page.id(), block->physical_size(),
+ free_val, &mtr);
+
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, block->page.id(), block->physical_size(),
+ false, &mtr);
+
+ mtr.commit();
+}
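+
+/* A minimal illustrative sketch, not part of the upstream file: how
+the change buffer bitmap page that describes a given page can be
+located, using only what the comments above state, namely that the
+bitmap pages repeat every physical_size pages at the fixed offset
+FSP_IBUF_BITMAP_OFFSET within each interval (compare the loop in
+ibuf_check_bitmap_on_import() and ibuf_bitmap_get_map_page()). The
+function name is hypothetical. */
+static uint32_t ibuf_example_bitmap_page_no(uint32_t page_no,
+ uint32_t physical_size)
+{
+ /* Round page_no down to the start of its bitmap interval, then
+ add the fixed offset of the change buffer bitmap page. */
+ return page_no - page_no % physical_size + FSP_IBUF_BITMAP_OFFSET;
+}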