Diffstat (limited to 'storage/innobase')
-rw-r--r-- storage/innobase/CMakeLists.txt | 7
-rw-r--r-- storage/innobase/btr/btr0btr.cc | 508
-rw-r--r-- storage/innobase/btr/btr0bulk.cc | 12
-rw-r--r-- storage/innobase/btr/btr0cur.cc | 566
-rw-r--r-- storage/innobase/btr/btr0defragment.cc | 820
-rw-r--r-- storage/innobase/btr/btr0pcur.cc | 29
-rw-r--r-- storage/innobase/btr/btr0sea.cc | 18
-rw-r--r-- storage/innobase/buf/buf0buddy.cc | 4
-rw-r--r-- storage/innobase/buf/buf0buf.cc | 546
-rw-r--r-- storage/innobase/buf/buf0dblwr.cc | 37
-rw-r--r-- storage/innobase/buf/buf0dump.cc | 2
-rw-r--r-- storage/innobase/buf/buf0flu.cc | 30
-rw-r--r-- storage/innobase/buf/buf0lru.cc | 65
-rw-r--r-- storage/innobase/buf/buf0rea.cc | 417
-rw-r--r-- storage/innobase/data/data0type.cc | 9
-rw-r--r-- storage/innobase/dict/dict0boot.cc | 55
-rw-r--r-- storage/innobase/dict/dict0defrag_bg.cc | 434
-rw-r--r-- storage/innobase/dict/dict0dict.cc | 45
-rw-r--r-- storage/innobase/dict/dict0load.cc | 17
-rw-r--r-- storage/innobase/dict/dict0stats.cc | 140
-rw-r--r-- storage/innobase/dict/dict0stats_bg.cc | 7
-rw-r--r-- storage/innobase/dict/drop.cc | 11
-rw-r--r-- storage/innobase/fil/fil0crypt.cc | 8
-rw-r--r-- storage/innobase/fil/fil0fil.cc | 195
-rw-r--r-- storage/innobase/fil/fil0pagecompress.cc | 3
-rw-r--r-- storage/innobase/fsp/fsp0fsp.cc | 748
-rw-r--r-- storage/innobase/fsp/fsp0sysspace.cc | 14
-rw-r--r-- storage/innobase/fut/fut0lst.cc | 12
-rw-r--r-- storage/innobase/gis/gis0rtree.cc | 277
-rw-r--r-- storage/innobase/gis/gis0sea.cc | 83
-rw-r--r-- storage/innobase/handler/ha_innodb.cc | 617
-rw-r--r-- storage/innobase/handler/ha_innodb.h | 23
-rw-r--r-- storage/innobase/handler/handler0alter.cc | 12
-rw-r--r-- storage/innobase/handler/i_s.cc | 42
-rw-r--r-- storage/innobase/ibuf/ibuf0ibuf.cc | 4610
-rw-r--r-- storage/innobase/include/btr0btr.h | 40
-rw-r--r-- storage/innobase/include/btr0cur.h | 35
-rw-r--r-- storage/innobase/include/btr0defragment.h | 65
-rw-r--r-- storage/innobase/include/btr0types.h | 45
-rw-r--r-- storage/innobase/include/buf0buf.h | 161
-rw-r--r-- storage/innobase/include/buf0buf.inl | 2
-rw-r--r-- storage/innobase/include/buf0dblwr.h | 40
-rw-r--r-- storage/innobase/include/buf0lru.h | 24
-rw-r--r-- storage/innobase/include/buf0rea.h | 52
-rw-r--r-- storage/innobase/include/data0type.h | 57
-rw-r--r-- storage/innobase/include/data0type.inl | 122
-rw-r--r-- storage/innobase/include/dict0boot.h | 35
-rw-r--r-- storage/innobase/include/dict0defrag_bg.h | 101
-rw-r--r-- storage/innobase/include/dict0dict.h | 23
-rw-r--r-- storage/innobase/include/dict0dict.inl | 2
-rw-r--r-- storage/innobase/include/dict0load.h | 8
-rw-r--r-- storage/innobase/include/dict0mem.h | 62
-rw-r--r-- storage/innobase/include/dict0stats.h | 25
-rw-r--r-- storage/innobase/include/dict0types.h | 13
-rw-r--r-- storage/innobase/include/fil0fil.h | 94
-rw-r--r-- storage/innobase/include/fsp0file.h | 5
-rw-r--r-- storage/innobase/include/fsp0fsp.h | 6
-rw-r--r-- storage/innobase/include/fsp0sysspace.h | 20
-rw-r--r-- storage/innobase/include/fsp0types.h | 16
-rw-r--r-- storage/innobase/include/fut0lst.h | 9
-rw-r--r-- storage/innobase/include/gis0rtree.h | 65
-rw-r--r-- storage/innobase/include/gis0rtree.inl | 5
-rw-r--r-- storage/innobase/include/ibuf0ibuf.h | 457
-rw-r--r-- storage/innobase/include/ibuf0ibuf.inl | 282
-rw-r--r-- storage/innobase/include/lock0lock.h | 7
-rw-r--r-- storage/innobase/include/log0crypt.h | 4
-rw-r--r-- storage/innobase/include/log0log.h | 4
-rw-r--r-- storage/innobase/include/log0recv.h | 61
-rw-r--r-- storage/innobase/include/mtr0mtr.h | 13
-rw-r--r-- storage/innobase/include/os0file.h | 12
-rw-r--r-- storage/innobase/include/page0cur.h | 12
-rw-r--r-- storage/innobase/include/page0cur.inl | 7
-rw-r--r-- storage/innobase/include/page0page.h | 19
-rw-r--r-- storage/innobase/include/page0zip.h | 10
-rw-r--r-- storage/innobase/include/page0zip.inl | 4
-rw-r--r-- storage/innobase/include/rem0rec.inl | 6
-rw-r--r-- storage/innobase/include/row0import.h | 10
-rw-r--r-- storage/innobase/include/row0purge.h | 35
-rw-r--r-- storage/innobase/include/row0row.h | 35
-rw-r--r-- storage/innobase/include/srv0mon.h | 21
-rw-r--r-- storage/innobase/include/srv0srv.h | 34
-rw-r--r-- storage/innobase/include/sux_lock.h | 4
-rw-r--r-- storage/innobase/include/trx0sys.h | 3
-rw-r--r-- storage/innobase/include/trx0trx.h | 3
-rw-r--r-- storage/innobase/include/trx0undo.h | 6
-rw-r--r-- storage/innobase/include/univ.i | 6
-rw-r--r-- storage/innobase/lock/lock0lock.cc | 44
-rw-r--r-- storage/innobase/log/log0crypt.cc | 10
-rw-r--r-- storage/innobase/log/log0log.cc | 50
-rw-r--r-- storage/innobase/log/log0recv.cc | 320
-rw-r--r-- storage/innobase/mtr/mtr0mtr.cc | 22
-rw-r--r-- storage/innobase/os/os0file.cc | 117
-rw-r--r-- storage/innobase/page/page0cur.cc | 17
-rw-r--r-- storage/innobase/page/page0page.cc | 39
-rw-r--r-- storage/innobase/page/page0zip.cc | 14
-rw-r--r-- storage/innobase/rem/rem0cmp.cc | 46
-rw-r--r-- storage/innobase/rem/rem0rec.cc | 9
-rw-r--r-- storage/innobase/row/row0import.cc | 409
-rw-r--r-- storage/innobase/row/row0ins.cc | 63
-rw-r--r-- storage/innobase/row/row0log.cc | 62
-rw-r--r-- storage/innobase/row/row0merge.cc | 12
-rw-r--r-- storage/innobase/row/row0mysql.cc | 25
-rw-r--r-- storage/innobase/row/row0purge.cc | 99
-rw-r--r-- storage/innobase/row/row0quiesce.cc | 15
-rw-r--r-- storage/innobase/row/row0row.cc | 77
-rw-r--r-- storage/innobase/row/row0sel.cc | 14
-rw-r--r-- storage/innobase/row/row0uins.cc | 31
-rw-r--r-- storage/innobase/row/row0umod.cc | 49
-rw-r--r-- storage/innobase/row/row0upd.cc | 41
-rw-r--r-- storage/innobase/srv/srv0mon.cc | 112
-rw-r--r-- storage/innobase/srv/srv0srv.cc | 99
-rw-r--r-- storage/innobase/srv/srv0start.cc | 316
-rw-r--r-- storage/innobase/trx/trx0purge.cc | 4
-rw-r--r-- storage/innobase/trx/trx0rseg.cc | 10
-rw-r--r-- storage/innobase/trx/trx0sys.cc | 10
-rw-r--r-- storage/innobase/trx/trx0trx.cc | 7
-rw-r--r-- storage/innobase/trx/trx0undo.cc | 20
117 files changed, 4025 insertions, 10748 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index b3125ca9..84fb6845 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -1,6 +1,6 @@
# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2014, 2022, MariaDB Corporation.
+# Copyright (c) 2014, 2023, MariaDB Corporation.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -139,7 +139,6 @@ SET(INNOBASE_SOURCES
btr/btr0cur.cc
btr/btr0pcur.cc
btr/btr0sea.cc
- btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@@ -157,7 +156,6 @@ SET(INNOBASE_SOURCES
dict/dict0mem.cc
dict/dict0stats.cc
dict/dict0stats_bg.cc
- dict/dict0defrag_bg.cc
dict/drop.cc
eval/eval0eval.cc
eval/eval0proc.cc
@@ -192,7 +190,6 @@ SET(INNOBASE_SOURCES
include/btr0bulk.h
include/btr0cur.h
include/btr0cur.inl
- include/btr0defragment.h
include/btr0pcur.h
include/btr0pcur.inl
include/btr0sea.h
@@ -217,7 +214,6 @@ SET(INNOBASE_SOURCES
include/dict0boot.h
include/dict0crea.h
include/dict0crea.inl
- include/dict0defrag_bg.h
include/dict0dict.h
include/dict0dict.inl
include/dict0load.h
@@ -270,7 +266,6 @@ SET(INNOBASE_SOURCES
include/handler0alter.h
include/hash0hash.h
include/ibuf0ibuf.h
- include/ibuf0ibuf.inl
include/lock0iter.h
include/lock0lock.h
include/lock0lock.inl
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 6b3a3733..45207d9d 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -34,10 +34,8 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
-#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
-#include "ibuf0ibuf.h"
#include "trx0trx.h"
#include "srv0mon.h"
#include "gis0geo.h"
@@ -181,9 +179,8 @@ we allocate pages for the non-leaf levels of the tree.
@param block B-tree root page
@param space tablespace
@return whether the segment header is valid */
-static bool btr_root_fseg_validate(ulint offset,
- const buf_block_t &block,
- const fil_space_t &space)
+bool btr_root_fseg_validate(ulint offset, const buf_block_t &block,
+ const fil_space_t &space)
{
ut_ad(block.page.id().space() == space.id);
const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
@@ -213,13 +210,12 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
-@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
@param[out] first set if this is a first-time access to the page
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
- uint32_t page, rw_lock_type_t mode, bool merge,
+ uint32_t page, rw_lock_type_t mode,
mtr_t *mtr, dberr_t *err, bool *first)
{
ut_ad(mode != RW_NO_LATCH);
@@ -229,7 +225,7 @@ buf_block_t *btr_block_get(const dict_index_t &index,
buf_block_t *block=
buf_page_get_gen(page_id_t{index.table->space->id, page},
index.table->space->zip_size(), mode, nullptr, BUF_GET,
- mtr, err, merge && !index.is_clust());
+ mtr, err);
ut_ad(!block == (*err != DB_SUCCESS));
if (UNIV_LIKELY(block != nullptr))
@@ -282,7 +278,7 @@ btr_root_block_get(
block=
buf_page_get_gen(page_id_t{index->table->space->id, index->page},
index->table->space->zip_size(), mode, guess, BUF_GET,
- mtr, err, false);
+ mtr, err);
ut_ad(!block == (*err != DB_SUCCESS));
if (UNIV_LIKELY(block != nullptr))
@@ -297,7 +293,6 @@ btr_root_block_get(
*err= DB_PAGE_CORRUPTED;
block= nullptr;
}
- else if (index->is_ibuf());
else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
*block, *index->table->space) ||
!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
@@ -533,41 +528,7 @@ btr_block_reget(mtr_t *mtr, const dict_index_t &index,
}
ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
- return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
-}
-
-/**************************************************************//**
-Allocates a new file page to be used in an ibuf tree. Takes the page from
-the free list of the tree, which must contain pages!
-@return new allocated block, x-latched */
-static
-buf_block_t*
-btr_page_alloc_for_ibuf(
-/*====================*/
- dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr, /*!< in: mtr */
- dberr_t* err) /*!< out: error code */
-{
- buf_block_t *root= btr_get_latched_root(*index, mtr);
- if (UNIV_UNLIKELY(!root))
- return root;
- buf_block_t *new_block=
- buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
- mach_read_from_4(PAGE_HEADER +
- PAGE_BTR_IBUF_FREE_LIST +
- FLST_FIRST + FIL_ADDR_PAGE +
- root->page.frame)),
- 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
- if (new_block)
- {
- buf_page_make_young_if_needed(&new_block->page);
- *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
- PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- fil_system.sys_space->free_limit, mtr);
- }
- ut_d(if (*err == DB_SUCCESS)
- flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
- return new_block;
+ return btr_block_get(index, id.page_no(), RW_X_LATCH, mtr, err);
}
static MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -599,10 +560,9 @@ buf_block_t *btr_root_block_sx(dict_index_t *index, mtr_t *mtr, dberr_t *err)
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
+MY_ATTRIBUTE((nonnull, warn_unused_result))
buf_block_t*
-btr_page_alloc_low(
-/*===============*/
+btr_page_alloc(
dict_index_t* index, /*!< in: index */
uint32_t hint_page_no, /*!< in: hint of a good page */
byte file_direction, /*!< in: direction where a possible
@@ -616,6 +576,8 @@ btr_page_alloc_low(
page should be initialized. */
dberr_t* err) /*!< out: error code */
{
+ ut_ad(level < BTR_MAX_NODE_LEVEL);
+
buf_block_t *root= btr_root_block_sx(index, mtr, err);
if (UNIV_UNLIKELY(!root))
return root;
@@ -625,55 +587,6 @@ btr_page_alloc_low(
true, mtr, init_mtr, err);
}
-/**************************************************************//**
-Allocates a new file page to be used in an index tree. NOTE: we assume
-that the caller has made the reservation for free extents!
-@retval NULL if no page could be allocated */
-buf_block_t*
-btr_page_alloc(
-/*===========*/
- dict_index_t* index, /*!< in: index */
- uint32_t hint_page_no, /*!< in: hint of a good page */
- byte file_direction, /*!< in: direction where a possible
- page split is made */
- ulint level, /*!< in: level where the page is placed
- in the tree */
- mtr_t* mtr, /*!< in/out: mini-transaction
- for the allocation */
- mtr_t* init_mtr, /*!< in/out: mini-transaction
- for x-latching and initializing
- the page */
- dberr_t* err) /*!< out: error code */
-{
- ut_ad(level < BTR_MAX_NODE_LEVEL);
- return index->is_ibuf()
- ? btr_page_alloc_for_ibuf(index, mtr, err)
- : btr_page_alloc_low(index, hint_page_no, file_direction, level,
- mtr, init_mtr, err);
-}
-
-/**************************************************************//**
-Frees a page used in an ibuf tree. Puts the page to the free list of the
-ibuf tree. */
-static
-dberr_t
-btr_page_free_for_ibuf(
-/*===================*/
- dict_index_t* index, /*!< in: index tree */
- buf_block_t* block, /*!< in: block to be freed, x-latched */
- mtr_t* mtr) /*!< in: mtr */
-{
- ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
- buf_block_t *root= btr_get_latched_root(*index, mtr);
- dberr_t err=
- flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- fil_system.sys_space->free_limit, mtr);
- ut_d(if (err == DB_SUCCESS)
- flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
- return err;
-}
-
/** Free an index page.
@param[in,out] index index tree
@param[in,out] block block to be freed
@@ -706,9 +619,6 @@ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
The page will be freed, so previous changes to it by this
mini-transaction should not matter. */
- if (index->is_ibuf())
- return btr_page_free_for_ibuf(index, block, mtr);
-
fil_space_t *space= index->table->space;
dberr_t err;
@@ -775,8 +685,7 @@ btr_node_ptr_get_child(
return btr_block_get(
*index, btr_node_ptr_get_child_page_no(node_ptr, offsets),
- RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1,
- mtr, err);
+ RW_SX_LATCH, mtr, err);
}
MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
@@ -1040,77 +949,32 @@ btr_create(
mtr_t* mtr,
dberr_t* err)
{
- buf_block_t* block;
-
ut_ad(mtr->is_named_space(space));
ut_ad(index_id != BTR_FREED_INDEX_ID);
ut_ad(index || space == fil_system.sys_space);
- /* Create the two new segments (one, in the case of an ibuf tree) for
- the index tree; the segment headers are put on the allocated root page
- (for an ibuf tree, not in the root, but on a separate ibuf header
- page) */
-
- if (UNIV_UNLIKELY(type & DICT_IBUF)) {
- /* Allocate first the ibuf header page */
- buf_block_t* ibuf_hdr_block = fseg_create(
- space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err);
-
- if (ibuf_hdr_block == NULL) {
- return(FIL_NULL);
- }
-
- ut_ad(ibuf_hdr_block->page.id().page_no()
- == IBUF_HEADER_PAGE_NO);
- /* Allocate then the next page to the segment: it will be the
- tree root page */
+ /* Create the two new segments for the index tree;
+ the segment headers are put on the allocated root page */
- block = fseg_alloc_free_page_general(
- buf_block_get_frame(ibuf_hdr_block)
- + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
- IBUF_TREE_ROOT_PAGE_NO,
- FSP_UP, false, mtr, mtr, err);
+ buf_block_t *block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
+ mtr, err);
- if (block == NULL) {
- return(FIL_NULL);
- }
-
- ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO));
-
- flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
- } else {
- block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
- mtr, err);
-
- if (block == NULL) {
- return(FIL_NULL);
- }
+ if (!block) {
+ return FIL_NULL;
+ }
- if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
- err, false, block)) {
- /* Not enough space for new segment, free root
- segment before return. */
- btr_free_root(block, *space, mtr);
- return(FIL_NULL);
- }
+ if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
+ err, false, block)) {
+ /* Not enough space for new segment, free root
+ segment before return. */
+ btr_free_root(block, *space, mtr);
+ return FIL_NULL;
}
ut_ad(!page_has_siblings(block->page.frame));
btr_root_page_init(block, index_id, index, mtr);
- /* We reset the free bits for the page in a separate
- mini-transaction to allow creation of several trees in the
- same mtr, otherwise the latch on a bitmap page would prevent
- it because of the latching order.
-
- Note: Insert Buffering is disabled for temporary tables given that
- most temporary tables are smaller in size and short-lived. */
- if (!(type & DICT_CLUSTERED)
- && (!index || !index->table->is_temporary())) {
- ibuf_reset_free_bits(block);
- }
-
/* In the following assertion we test that two records of maximum
allowed size fit on the root page: this fact is needed to ensure
correctness of split algorithms */
@@ -1260,9 +1124,9 @@ void btr_drop_temporary_table(const dict_table_t &table)
for (const dict_index_t *index= table.indexes.start; index;
index= dict_table_get_next_index(index))
{
- if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0,
- RW_X_LATCH, nullptr, BUF_GET, &mtr,
- nullptr, false))
+ if (buf_block_t *block= buf_page_get_gen({SRV_TMP_SPACE_ID, index->page},
+ 0, RW_X_LATCH, nullptr, BUF_GET,
+ &mtr, nullptr))
{
btr_free_but_not_root(block, MTR_LOG_NO_REDO);
mtr.set_log_mode(MTR_LOG_NO_REDO);
@@ -1451,18 +1315,18 @@ static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
if (page_get_max_trx_id(block->page.frame))
/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
clustered index root pages. */
- ut_ad(dict_index_is_sec_or_ibuf(cursor->index)
+ ut_ad(!cursor->index->is_primary()
? page_is_leaf(block->page.frame)
: block->page.id().page_no() == cursor->index->page);
else
/* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
pages, and in temporary tables. It was always zero-initialized in
- page_create(). PAGE_MAX_TRX_ID must be nonzero on
- dict_index_is_sec_or_ibuf() leaf pages. */
+ page_create(). PAGE_MAX_TRX_ID must be nonzero on secondary index
+ leaf pages. */
ut_ad(cursor->index->table->is_temporary() ||
!page_is_leaf(block->page.frame) ||
- !dict_index_is_sec_or_ibuf(cursor->index));
+ cursor->index->is_primary());
#endif
const uint16_t data_size1= page_get_data_size(old->page.frame);
@@ -1662,15 +1526,7 @@ static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
return DB_SUCCESS;
}
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
-
+/** Reorganize an index page.
@return error code
@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
dberr_t
@@ -1689,15 +1545,7 @@ btr_page_reorganize_block(
return btr_page_reorganize_low(&cur, mtr);
}
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
-
+/** Reorganize an index page.
@param cursor page cursor
@param mtr mini-transaction
@return error code
@@ -1923,6 +1771,7 @@ btr_root_raise_and_insert(
ut_ad(!page_is_empty(root->page.frame));
index = btr_cur_get_index(cursor);
ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
+ ut_ad(!index->is_spatial());
#ifdef UNIV_ZIP_DEBUG
ut_a(!root_page_zip
|| page_zip_validate(root_page_zip, root->page.frame, index));
@@ -1938,12 +1787,11 @@ btr_root_raise_and_insert(
return nullptr;
}
- if (index->is_ibuf()) {
- } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
- *root, *index->table->space)
- || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
- *root, *index->table->space)) {
- return nullptr;
+ if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *root, *index->table->space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *root, *index->table->space)) {
+ return nullptr;
}
/* Allocate a new page to the tree. Root splitting is done by first
@@ -2009,18 +1857,12 @@ btr_root_raise_and_insert(
page_get_infimum_rec(root->page.frame));
}
- /* Move any existing predicate locks */
- if (dict_index_is_spatial(index)) {
- lock_prdt_rec_move(new_block, root_id);
- } else {
- btr_search_move_or_delete_hash_entries(
- new_block, root);
- }
+ btr_search_move_or_delete_hash_entries(new_block, root);
}
constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
- if (dict_index_is_sec_or_ibuf(index)) {
- /* In secondary indexes and the change buffer,
+ if (!index->is_primary()) {
+ /* In secondary indexes,
PAGE_MAX_TRX_ID can be reset on the root page, because
the field only matters on leaf pages, and the root no
longer is a leaf page. (Older versions of InnoDB did
@@ -2070,16 +1912,8 @@ btr_root_raise_and_insert(
/* Build the node pointer (= node key and page address) for the
child */
- if (dict_index_is_spatial(index)) {
- rtr_mbr_t new_mbr;
-
- rtr_page_cal_mbr(index, new_block, &new_mbr, *heap);
- node_ptr = rtr_index_build_node_ptr(
- index, &new_mbr, rec, new_page_no, *heap);
- } else {
- node_ptr = dict_index_build_node_ptr(
- index, rec, new_page_no, *heap, level);
- }
+ node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, *heap,
+ level);
/* The node pointer must be marked as the predefined minimum record,
as there is no lower alphabetical limit to records in the leftmost
node of a level: */
@@ -2112,13 +1946,6 @@ btr_root_raise_and_insert(
to new_block at this point. Thus, the data should fit. */
ut_a(node_ptr_rec);
- /* We play safe and reset the free bits for the new page */
-
- if (!dict_index_is_clust(index)
- && !index->table->is_temporary()) {
- ibuf_reset_free_bits(new_block);
- }
-
page_cursor->block = new_block;
page_cursor->index = index;
@@ -2484,10 +2311,9 @@ btr_insert_on_non_leaf_level(
rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
rtr_info_update_btr(&cursor, &rtr_info);
- err = rtr_search_to_nth_level(level, tuple,
- PAGE_CUR_RTREE_INSERT,
- BTR_CONT_MODIFY_TREE,
- &cursor, mtr);
+ err = rtr_search_to_nth_level(&cursor, nullptr, tuple,
+ BTR_CONT_MODIFY_TREE, mtr,
+ PAGE_CUR_RTREE_INSERT, level);
} else {
err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH,
&cursor, mtr);
@@ -2613,7 +2439,7 @@ btr_attach_half_pages(
ut_ad(mtr->memo_contains(index->lock,
MTR_MEMO_X_LOCK));
prev_block = btr_block_get(*index, prev_page_no,
- RW_X_LATCH, !level, mtr);
+ RW_X_LATCH, mtr);
}
#endif
}
@@ -2625,7 +2451,7 @@ btr_attach_half_pages(
ut_ad(mtr->memo_contains(index->lock,
MTR_MEMO_X_LOCK));
next_block = btr_block_get(*index, next_page_no,
- RW_X_LATCH, !level, mtr);
+ RW_X_LATCH, mtr);
}
#endif
}
@@ -2773,10 +2599,9 @@ btr_insert_into_right_sibling(
page_t* next_page;
btr_cur_t next_father_cursor;
rec_t* rec = nullptr;
- ulint max_size;
next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH,
- page_is_leaf(page), mtr);
+ mtr);
if (UNIV_UNLIKELY(!next_block)) {
return nullptr;
}
@@ -2799,8 +2624,6 @@ btr_insert_into_right_sibling(
return nullptr;
}
- max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
-
/* Extends gap lock for the next page */
if (is_leaf && cursor->index()->has_locking()) {
lock_update_node_pointer(block, next_block);
@@ -2810,15 +2633,6 @@ btr_insert_into_right_sibling(
n_ext, mtr);
if (!rec) {
- if (is_leaf
- && next_block->page.zip.ssize
- && !dict_index_is_clust(cursor->index())
- && !cursor->index()->table->is_temporary()) {
- /* Reset the IBUF_BITMAP_FREE bits, because
- page_cur_tuple_insert() will have attempted page
- reorganize before failing. */
- ibuf_reset_free_bits(next_block);
- }
return nullptr;
}
@@ -2856,34 +2670,12 @@ btr_insert_into_right_sibling(
}
ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
-
- if (is_leaf
- && !dict_index_is_clust(cursor->index())
- && !cursor->index()->table->is_temporary()) {
- /* Update the free bits of the B-tree page in the
- insert buffer bitmap. */
-
- if (next_block->page.zip.ssize) {
- ibuf_update_free_bits_zip(next_block, mtr);
- } else {
- ibuf_update_free_bits_if_full(
- next_block, max_size,
- rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
- }
- }
-
return(rec);
}
/*************************************************************//**
Moves record list end to another page. Moved records include
split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return error code */
static
dberr_t
@@ -2939,12 +2731,6 @@ page_move_rec_list_end(
/*************************************************************//**
Moves record list start to another page. Moved records do not include
split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return error code */
static
dberr_t
@@ -3002,15 +2788,10 @@ btr_page_split_and_insert(
ut_ad(*err == DB_SUCCESS);
ut_ad(dtuple_check_typed(tuple));
+ ut_ad(!cursor->index()->is_spatial());
buf_pool.pages_split++;
- if (cursor->index()->is_spatial()) {
- /* Split rtree page and update parent */
- return rtr_page_split_and_insert(flags, cursor, offsets, heap,
- tuple, n_ext, mtr, err);
- }
-
if (!*heap) {
*heap = mem_heap_create(1024);
}
@@ -3101,12 +2882,6 @@ got_split_rec:
}
btr_page_create(new_block, new_page_zip, cursor->index(),
page_level, mtr);
- /* Only record the leaf level page splits. */
- if (!page_level) {
- cursor->index()->stat_defrag_n_page_split ++;
- cursor->index()->stat_defrag_modified_counter ++;
- btr_defragment_save_defrag_stats_if_needed(cursor->index());
- }
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@@ -3375,13 +3150,6 @@ insert_empty:
/* The insert did not fit on the page: loop back to the
start of the function for a new split */
insert_failed:
- /* We play safe and reset the free bits for new_page */
- if (!dict_index_is_clust(page_cursor->index)
- && !page_cursor->index->table->is_temporary()) {
- ibuf_reset_free_bits(new_block);
- ibuf_reset_free_bits(block);
- }
-
n_iterations++;
ut_ad(n_iterations < 2
|| buf_block_get_page_zip(insert_block));
@@ -3391,17 +3159,6 @@ insert_failed:
}
func_exit:
- /* Insert fit on the page: update the free bits for the
- left and right pages in the same mtr */
-
- if (!dict_index_is_clust(page_cursor->index)
- && !page_cursor->index->table->is_temporary()
- && page_is_leaf(page)) {
-
- ibuf_update_free_bits_for_two_pages_low(
- left_block, right_block, mtr);
- }
-
ut_ad(page_validate(buf_block_get_frame(left_block),
page_cursor->index));
ut_ad(page_validate(buf_block_get_frame(right_block),
@@ -3437,8 +3194,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block,
if (!prev)
{
ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
- prev= btr_block_get(index, id.page_no(), RW_X_LATCH,
- page_is_leaf(block.page.frame), mtr, &err);
+ prev= btr_block_get(index, id.page_no(), RW_X_LATCH, mtr, &err);
if (UNIV_UNLIKELY(!prev))
return err;
}
@@ -3453,8 +3209,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block,
if (!next)
{
ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
- next= btr_block_get(index, id.page_no(), RW_X_LATCH,
- page_is_leaf(block.page.frame), mtr, &err);
+ next= btr_block_get(index, id.page_no(), RW_X_LATCH, mtr, &err);
if (UNIV_UNLIKELY(!next))
return err;
}
@@ -3479,6 +3234,7 @@ btr_lift_page_up(
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
+ que_thr_t* thr, /*!< in/out: query thread */
mtr_t* mtr, /*!< in/out: mini-transaction */
dberr_t* err) /*!< out: error code */
{
@@ -3513,7 +3269,8 @@ btr_lift_page_up(
if (index->is_spatial()) {
offsets = rtr_page_get_father_block(
- nullptr, heap, mtr, nullptr, &cursor);
+ nullptr, heap, nullptr, &cursor,
+ thr, mtr);
} else {
offsets = btr_page_get_father_block(offsets, heap,
mtr, &cursor);
@@ -3542,7 +3299,8 @@ parent_corrupted:
if (index->is_spatial()) {
offsets = rtr_page_get_father_block(
- nullptr, heap, mtr, nullptr, &cursor);
+ nullptr, heap, nullptr, &cursor, thr,
+ mtr);
} else {
offsets = btr_page_get_father_block(offsets,
heap,
@@ -3677,13 +3435,8 @@ copied:
/* Free the file page */
btr_page_free(index, block, mtr);
- /* We play it safe and reset the free bits for the father */
- if (!dict_index_is_clust(index)
- && !index->table->is_temporary()) {
- ibuf_reset_free_bits(father_block);
- }
ut_ad(page_validate(father_block->page.frame, index));
- ut_ad(btr_check_node_ptr(index, father_block, mtr));
+ ut_ad(btr_check_node_ptr(index, father_block, thr, mtr));
return(lift_father_up ? block_orig : father_block);
}
@@ -3750,8 +3503,10 @@ btr_compress(
father_cursor.page_cur.block = block;
if (index->is_spatial()) {
+ ut_ad(cursor->rtr_info);
offsets = rtr_page_get_father_block(
- NULL, heap, mtr, cursor, &father_cursor);
+ nullptr, heap, cursor, &father_cursor,
+ cursor->rtr_info->thr, mtr);
ut_ad(cursor->page_cur.block->page.id() == block->page.id());
rec_t* my_rec = father_cursor.page_cur.rec;
@@ -3761,17 +3516,16 @@ btr_compress(
ib::info() << "father positioned on page "
<< page_no << "instead of "
<< block->page.id().page_no();
- offsets = btr_page_get_father_block(
- NULL, heap, mtr, &father_cursor);
+ goto get_offsets;
}
} else {
+get_offsets:
offsets = btr_page_get_father_block(
NULL, heap, mtr, &father_cursor);
}
if (UNIV_UNLIKELY(!offsets)) {
- err = DB_CORRUPTION;
- goto func_exit;
+ goto corrupted;
}
if (adjust) {
@@ -3779,14 +3533,7 @@ btr_compress(
if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) {
corrupted:
err = DB_CORRUPTION;
- err_exit:
- /* We play it safe and reset the free bits. */
- if (merge_block && merge_block->zip_size()
- && page_is_leaf(merge_block->page.frame)
- && !index->is_clust()) {
- ibuf_reset_free_bits(merge_block);
- }
- goto func_exit;
+ goto err_exit;
}
}
@@ -3794,7 +3541,10 @@ btr_compress(
/* The page is the only one on the level, lift the records
to the father */
- merge_block = btr_lift_page_up(index, block, mtr, &err);
+ merge_block = btr_lift_page_up(index, block,
+ cursor->rtr_info
+ ? cursor->rtr_info->thr
+ : nullptr, mtr, &err);
success:
if (adjust) {
ut_ad(nth_rec > 0);
@@ -3809,7 +3559,7 @@ success:
}
MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
-func_exit:
+err_exit:
mem_heap_free(heap);
DBUG_RETURN(err);
}
@@ -4109,49 +3859,6 @@ cannot_merge:
}
}
- if (!dict_index_is_clust(index)
- && !index->table->is_temporary()
- && page_is_leaf(merge_page)) {
- /* Update the free bits of the B-tree page in the
- insert buffer bitmap. This has to be done in a
- separate mini-transaction that is committed before the
- main mini-transaction. We cannot update the insert
- buffer bitmap in this mini-transaction, because
- btr_compress() can be invoked recursively without
- committing the mini-transaction in between. Since
- insert buffer bitmap pages have a lower rank than
- B-tree pages, we must not access other pages in the
- same mini-transaction after accessing an insert buffer
- bitmap page. */
-
- /* The free bits in the insert buffer bitmap must
- never exceed the free space on a page. It is safe to
- decrement or reset the bits in the bitmap in a
- mini-transaction that is committed before the
- mini-transaction that affects the free space. */
-
- /* It is unsafe to increment the bits in a separately
- committed mini-transaction, because in crash recovery,
- the free bits could momentarily be set too high. */
-
- if (merge_block->zip_size()) {
- /* Because the free bits may be incremented
- and we cannot update the insert buffer bitmap
- in the same mini-transaction, the only safe
- thing we can do here is the pessimistic
- approach: reset the free bits. */
- ibuf_reset_free_bits(merge_block);
- } else {
- /* On uncompressed pages, the free bits will
- never increase here. Thus, it is safe to
- write the bits accurately in a separate
- mini-transaction. */
- ibuf_update_free_bits_if_full(merge_block,
- srv_page_size,
- ULINT_UNDEFINED);
- }
- }
-
ut_ad(page_validate(merge_page, index));
#ifdef UNIV_ZIP_DEBUG
ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
@@ -4166,7 +3873,10 @@ cannot_merge:
err = btr_page_free(index, block, mtr);
if (err == DB_SUCCESS) {
ut_ad(leftmost_child
- || btr_check_node_ptr(index, merge_block, mtr));
+ || btr_check_node_ptr(index, merge_block,
+ cursor->rtr_info
+ ? cursor->rtr_info->thr
+ : nullptr, mtr));
goto success;
} else {
goto err_exit;
@@ -4183,11 +3893,13 @@ static
void
btr_discard_only_page_on_level(
/*===========================*/
- dict_index_t* index, /*!< in: index tree */
- buf_block_t* block, /*!< in: page which is the only on its level */
+ btr_cur_t* cur, /*!< in: cursor on a page which is the
+ only on its level */
mtr_t* mtr) /*!< in: mtr */
{
- ulint page_level = 0;
+ dict_index_t* index = cur->index();
+ buf_block_t* block = btr_cur_get_block(cur);
+ ulint page_level = 0;
ut_ad(!index->is_dummy);
@@ -4218,7 +3930,8 @@ btr_discard_only_page_on_level(
if (index->is_spatial()) {
/* Check any concurrent search having this page */
rtr_check_discard_page(index, NULL, block);
- if (!rtr_page_get_father(mtr, nullptr, &cursor)) {
+ if (!rtr_page_get_father(mtr, nullptr, &cursor,
+ cur->rtr_info->thr)) {
return;
}
} else {
@@ -4284,9 +3997,6 @@ btr_discard_only_page_on_level(
index->clear_instant_add();
}
} else if (!index->table->is_temporary()) {
- /* We play it safe and reset the free bits for the root */
- ibuf_reset_free_bits(block);
-
ut_a(max_trx_id);
page_set_max_trx_id(block,
buf_block_get_page_zip(block),
@@ -4323,7 +4033,8 @@ btr_discard_page(
MONITOR_INC(MONITOR_INDEX_DISCARD);
if (index->is_spatial()
- ? !rtr_page_get_father(mtr, cursor, &parent_cursor)
+ ? !rtr_page_get_father(mtr, cursor, &parent_cursor,
+ cursor->rtr_info->thr)
: !btr_page_get_father(mtr, &parent_cursor)) {
return DB_CORRUPTION;
}
@@ -4397,7 +4108,7 @@ btr_discard_page(
return DB_CORRUPTION;
}
} else {
- btr_discard_only_page_on_level(index, block, mtr);
+ btr_discard_only_page_on_level(cursor, mtr);
return DB_SUCCESS;
}
@@ -4452,14 +4163,20 @@ btr_discard_page(
If the merge_block's parent block is not same,
we cannot use btr_check_node_ptr() */
ut_ad(parent_is_different
- || btr_check_node_ptr(index, merge_block, mtr));
+ || btr_check_node_ptr(index, merge_block,
+ cursor->rtr_info
+ ? cursor->rtr_info->thr
+ : nullptr, mtr));
if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
== index->page
&& !page_has_siblings(btr_cur_get_page(&parent_cursor))
&& page_get_n_recs(btr_cur_get_page(&parent_cursor))
== 1) {
- btr_lift_page_up(index, merge_block, mtr, &err);
+ btr_lift_page_up(index, merge_block,
+ cursor->rtr_info
+ ? cursor->rtr_info->thr
+ : nullptr, mtr, &err);
}
}
@@ -4478,13 +4195,6 @@ btr_print_size(
fseg_header_t* seg;
mtr_t mtr;
- if (dict_index_is_ibuf(index)) {
- fputs("Sorry, cannot print info of an ibuf tree:"
- " use ibuf functions\n", stderr);
-
- return;
- }
-
mtr_start(&mtr);
root = btr_root_get(index, &mtr);
@@ -4494,13 +4204,10 @@ btr_print_size(
fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
fseg_print(seg, &mtr);
- if (!dict_index_is_ibuf(index)) {
-
- seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
- fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
- fseg_print(seg, &mtr);
- }
+ fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
mtr_commit(&mtr);
}
@@ -4611,6 +4318,7 @@ btr_check_node_ptr(
/*===============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: index page */
+ que_thr_t* thr, /*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
{
mem_heap_t* heap;
@@ -4632,8 +4340,8 @@ btr_check_node_ptr(
heap = mem_heap_create(256);
if (dict_index_is_spatial(index)) {
- offsets = rtr_page_get_father_block(NULL, heap, mtr,
- NULL, &cursor);
+ offsets = rtr_page_get_father_block(NULL, heap,
+ NULL, &cursor, thr, mtr);
} else {
offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor);
}
@@ -4708,14 +4416,6 @@ btr_index_rec_validate(
ut_ad(index->n_core_fields);
- if (index->is_ibuf()) {
- /* The insert buffer index tree can contain records from any
- other index: we cannot check the number of fields or
- their length */
-
- return(TRUE);
- }
-
#ifdef VIRTUAL_INDEX_DEBUG
if (dict_index_has_virtual(index)) {
fprintf(stderr, "index name is %s\n", index->name());
@@ -5043,8 +4743,7 @@ corrupted:
mtr.release_last_page();
block = btr_block_get(*index, left_page_no,
- RW_SX_LATCH, false,
- &mtr, &err);
+ RW_SX_LATCH, &mtr, &err);
if (!block) {
goto invalid_page;
}
@@ -5115,7 +4814,7 @@ func_exit:
const rec_t* right_rec;
right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
- !level, &mtr, &err);
+ &mtr, &err);
if (!right_block) {
btr_validate_report1(index, level, block);
fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr);
@@ -5368,7 +5067,7 @@ node_ptr_fails:
mtr.start();
block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
- !level, &mtr, &err);
+ &mtr, &err);
goto loop;
}
@@ -5435,8 +5134,7 @@ error:
index = btr_cur_get_index(cursor);
page = btr_cur_get_page(cursor);
- mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
- mtr);
+ mblock = btr_block_get(*index, page_no, RW_X_LATCH, mtr);
if (!mblock) {
goto error;
}
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index e2513ad6..094d7570 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,7 +28,6 @@ Created 03/11/2014 Shaohua Wang
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0pcur.h"
-#include "ibuf0ibuf.h"
#include "page0page.h"
#include "trx0trx.h"
@@ -95,7 +94,7 @@ PageBulk::init()
}
} else {
new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH,
- false, &m_mtr);
+ &m_mtr);
if (!new_block) {
m_mtr.commit();
return(DB_CORRUPTION);
@@ -110,7 +109,7 @@ PageBulk::init()
m_page_zip = buf_block_get_page_zip(new_block);
- if (!m_level && dict_index_is_sec_or_ibuf(m_index)) {
+ if (!m_level && !m_index->is_primary()) {
page_update_max_trx_id(new_block, m_page_zip, m_trx_id,
&m_mtr);
}
@@ -551,9 +550,6 @@ inline void PageBulk::finish()
void PageBulk::commit(bool success)
{
finish();
- if (success && !m_index->is_clust() && page_is_leaf(m_page))
- ibuf_set_bitmap_for_bulk_load(m_block, &m_mtr,
- innobase_fill_factor == 100);
m_mtr.commit();
}
@@ -1182,7 +1178,7 @@ BtrBulk::finish(dberr_t err)
ut_ad(last_page_no != FIL_NULL);
last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH,
- false, &mtr);
+ &mtr);
if (!last_block) {
err = DB_CORRUPTION;
err_exit:
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 2fc05b06..799a8575 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -55,7 +55,6 @@ Created 10/16/1994 Heikki Tuuri
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
-#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"
@@ -67,15 +66,6 @@ Created 10/16/1994 Heikki Tuuri
#endif /* WITH_WSREP */
#include "log.h"
-/** Buffered B-tree operation types, introduced as part of delete buffering. */
-enum btr_op_t {
- BTR_NO_OP = 0, /*!< Not buffered */
- BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
- BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
- BTR_DELETE_OP, /*!< Purge a delete-marked record */
- BTR_DELMARK_OP /*!< Mark a record for deletion */
-};
-
/** Modification types for the B-tree operation.
Note that the order must be DELETE, BOTH, INSERT !!
*/
@@ -191,10 +181,14 @@ when loading a table definition.
static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
{
ut_ad(index->is_primary());
- ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
- ut_ad(index->table->supports_instant());
ut_ad(index->table->is_readable());
+ if (!index->table->supports_instant()) {
+ return DB_SUCCESS;
+ }
+
+ ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
+
dberr_t err;
const fil_space_t* space = index->table->space;
if (!space) {
@@ -461,17 +455,25 @@ when loading a table definition.
@param[in,out] table table definition from the data dictionary
@return error code
@retval DB_SUCCESS if no error occurred */
-dberr_t
-btr_cur_instant_init(dict_table_t* table)
+dberr_t btr_cur_instant_init(dict_table_t *table)
{
- mtr_t mtr;
- dict_index_t* index = dict_table_get_first_index(table);
- mtr.start();
- dberr_t err = index
- ? btr_cur_instant_init_low(index, &mtr)
- : DB_CORRUPTION;
- mtr.commit();
- return(err);
+ mtr_t mtr;
+ dict_index_t *index= dict_table_get_first_index(table);
+ mtr.start();
+ dberr_t err = index ? btr_cur_instant_init_low(index, &mtr) : DB_CORRUPTION;
+ mtr.commit();
+ if (err == DB_SUCCESS && index->is_gen_clust())
+ {
+ btr_cur_t cur;
+ mtr.start();
+ err= cur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr);
+ if (err != DB_SUCCESS);
+ else if (const rec_t *rec= page_rec_get_prev(btr_cur_get_rec(&cur)))
+ if (page_rec_is_user_rec(rec))
+ table->row_id= mach_read_from_6(rec);
+ mtr.commit();
+ }
+ return(err);
}
/** Initialize the n_core_null_bytes on first access to a clustered
@@ -783,20 +785,6 @@ static bool btr_cur_need_opposite_intention(const buf_page_t &bpage,
@return maximum size of a node pointer record in bytes */
static ulint btr_node_ptr_max_size(const dict_index_t* index)
{
- if (dict_index_is_ibuf(index)) {
- /* cannot estimate accurately */
- /* This is universal index for change buffer.
- The max size of the entry is about max key length * 2.
- (index key + primary key to be inserted to the index)
- (The max key length is UNIV_PAGE_SIZE / 16 * 3 at
- ha_innobase::max_supported_key_length(),
- considering MAX_KEY_LENGTH = 3072 at MySQL imposes
- the 3500 historical InnoDB value for 16K page size case.)
- For the universal index, node_ptr contains most of the entry.
- And 512 is enough to contain ibuf columns and meta-data */
- return srv_page_size / 8 * 3 + 512;
- }
-
/* Each record has page_no, length of page_no and header. */
ulint comp = dict_table_is_comp(index->table);
ulint rec_max_size = comp
@@ -971,7 +959,7 @@ static int btr_latch_prev(buf_block_t *block, page_id_t page_id,
retry:
buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr,
- BUF_GET, mtr, err, false);
+ BUF_GET, mtr, err);
if (UNIV_UNLIKELY(!prev))
return 0;
@@ -1042,11 +1030,9 @@ static int btr_latch_prev(buf_block_t *block, page_id_t page_id,
dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
btr_latch_mode latch_mode, mtr_t *mtr)
{
- ut_ad(index()->is_btree() || index()->is_ibuf());
- ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+ ut_ad(index()->is_btree());
buf_block_t *guess;
- btr_op_t btr_op;
btr_intention_t lock_intention;
bool detected_same_key_root= false;
@@ -1074,34 +1060,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK |
MTR_MEMO_X_LOCK));
- /* These flags are mutually exclusive, they are lumped together
- with the latch mode for historical reasons. It's possible for
- none of the flags to be set. */
- switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) {
- default:
- btr_op= BTR_NO_OP;
- break;
- case BTR_INSERT:
- btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE)
- ? BTR_INSERT_IGNORE_UNIQUE_OP
- : BTR_INSERT_OP;
- break;
- case BTR_DELETE:
- btr_op= BTR_DELETE_OP;
- ut_a(purge_node);
- break;
- case BTR_DELETE_MARK:
- btr_op= BTR_DELMARK_OP;
- break;
- }
-
- /* Operations on the insert buffer tree cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf());
- /* Operations on the clustered index cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !index()->is_clust());
- /* Operations on the temporary table(indexes) cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary());
-
const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
@@ -1123,7 +1081,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
# ifdef UNIV_SEARCH_PERF_STAT
info->n_searches++;
# endif
- bool ahi_enabled= btr_search_enabled && !index()->is_ibuf();
+ bool ahi_enabled= btr_search_enabled;
/* We do a dirty read of btr_search_enabled below,
and btr_search_guess_on_hash() will have to check it again. */
if (!ahi_enabled);
@@ -1214,80 +1172,15 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
up_bytes= 0;
low_match= 0;
low_bytes= 0;
- ulint buf_mode= BUF_GET;
search_loop:
auto block_savepoint= mtr->get_savepoint();
buf_block_t *block=
- buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr,
- &err, height == 0 && !index()->is_clust());
+ buf_page_get_gen(page_id, zip_size, rw_latch, guess, BUF_GET, mtr, &err);
if (!block)
{
- switch (err) {
- case DB_DECRYPTION_FAILED:
+ if (err == DB_DECRYPTION_FAILED)
btr_decryption_failed(*index());
- /* fall through */
- default:
- goto func_exit;
- case DB_SUCCESS:
- /* This must be a search to perform an insert, delete mark, or delete;
- try using the change buffer */
- ut_ad(height == 0);
- ut_ad(thr);
- break;
- }
-
- switch (btr_op) {
- default:
- MY_ASSERT_UNREACHABLE();
- break;
- case BTR_INSERT_OP:
- case BTR_INSERT_IGNORE_UNIQUE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
-
- if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr))
- {
- flag= BTR_CUR_INSERT_TO_IBUF;
- goto func_exit;
- }
- break;
-
- case BTR_DELMARK_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
-
- if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
- index(), page_id, zip_size, thr))
- {
- flag = BTR_CUR_DEL_MARK_IBUF;
- goto func_exit;
- }
-
- break;
-
- case BTR_DELETE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
- auto& chain = buf_pool.page_hash.cell_get(page_id.fold());
-
- if (!row_purge_poss_sec(purge_node, index(), tuple))
- /* The record cannot be purged yet. */
- flag= BTR_CUR_DELETE_REF;
- else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(),
- page_id, zip_size, thr))
- /* The purge was buffered. */
- flag= BTR_CUR_DELETE_IBUF;
- else
- {
- /* The purge could not be buffered. */
- buf_pool.watch_unset(page_id, chain);
- break;
- }
-
- buf_pool.watch_unset(page_id, chain);
- goto func_exit;
- }
-
- /* Change buffering did not succeed, we must read the page. */
- buf_mode= BUF_GET;
- goto search_loop;
+ goto func_exit;
}
if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
@@ -1411,11 +1304,12 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
goto func_exit;
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
- rw_latch, false, mtr, &err))
+ rw_latch, mtr, &err))
goto func_exit;
goto release_tree;
case BTR_SEARCH_LEAF:
case BTR_MODIFY_LEAF:
+ ut_ad(rw_latch == rw_lock_type_t(latch_mode));
if (!latch_by_caller)
{
release_tree:
@@ -1436,7 +1330,7 @@ release_tree:
goto func_exit;
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
- RW_X_LATCH, false, mtr, &err))
+ RW_X_LATCH, mtr, &err))
goto func_exit;
if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
lock_intention,
@@ -1573,12 +1467,12 @@ release_tree:
case BTR_MODIFY_ROOT_AND_LEAF:
rw_latch= RW_X_LATCH;
break;
- case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */
+ case BTR_MODIFY_PREV: /* btr_pcur_move_to_prev() */
case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
if (!not_first_access)
- buf_read_ahead_linear(page_id, zip_size, false);
+ buf_read_ahead_linear(page_id, zip_size);
if (page_has_prev(block->page.frame) &&
page_rec_is_first(page_cur.rec, block->page.frame))
@@ -1612,15 +1506,8 @@ release_tree:
case BTR_MODIFY_LEAF:
case BTR_SEARCH_LEAF:
rw_latch= rw_lock_type_t(latch_mode);
- if (btr_op != BTR_NO_OP && !index()->is_ibuf() &&
- ibuf_should_try(index(), btr_op != BTR_INSERT_OP))
- /* Try to buffer the operation if the leaf page
- is not in the buffer pool. */
- buf_mode= btr_op == BTR_DELETE_OP
- ? BUF_GET_IF_IN_POOL_OR_WATCH
- : BUF_GET_IF_IN_POOL;
- else if (!not_first_access)
- buf_read_ahead_linear(page_id, zip_size, false);
+ if (!not_first_access)
+ buf_read_ahead_linear(page_id, zip_size);
break;
case BTR_MODIFY_TREE:
ut_ad(rw_latch == RW_X_LATCH);
@@ -1666,8 +1553,7 @@ ATTRIBUTE_COLD
dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
page_cur_mode_t mode, mtr_t *mtr)
{
- ut_ad(index()->is_btree() || index()->is_ibuf());
- ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+ ut_ad(index()->is_btree());
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets= offsets_;
@@ -1746,7 +1632,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
block=
buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET,
- mtr, &err, !--height && !index()->is_clust());
+ mtr, &err);
if (!block)
{
@@ -1761,7 +1647,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
!fil_page_index_page_check(block->page.frame))
goto corrupted;
- if (height != btr_page_get_level(block->page.frame))
+ if (--height != btr_page_get_level(block->page.frame))
goto corrupted;
btr_cur_nonleaf_make_young(&block->page);
@@ -1777,7 +1663,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
goto func_exit;
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
- RW_X_LATCH, false, mtr, &err))
+ RW_X_LATCH, mtr, &err))
goto func_exit;
goto search_loop;
}
@@ -1807,14 +1693,14 @@ dberr_t btr_cur_search_to_nth_level(ulint level,
{
dict_index_t *const index= cursor->index();
- ut_ad(index->is_btree() || index->is_ibuf());
+ ut_ad(index->is_btree());
mem_heap_t *heap= nullptr;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs *offsets= offsets_;
rec_offs_init(offsets_);
ut_ad(level);
ut_ad(dict_index_check_search_tuple(index, tuple));
- ut_ad(index->is_ibuf() ? ibuf_inside(mtr) : index->is_btree());
+ ut_ad(index->is_btree());
ut_ad(dtuple_check_typed(tuple));
ut_ad(index->page != FIL_NULL);
@@ -1834,6 +1720,18 @@ dberr_t btr_cur_search_to_nth_level(ulint level,
ut_ad(mtr->memo_contains_flagged(&index->lock,
MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+ dberr_t err;
+
+ if (!index->table->space)
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
+
const ulint zip_size= index->table->space->zip_size();
/* Start with the root page. */
@@ -1841,7 +1739,7 @@ dberr_t btr_cur_search_to_nth_level(ulint level,
ulint height= ULINT_UNDEFINED;
search_loop:
- dberr_t err= DB_SUCCESS;
+ err= DB_SUCCESS;
if (buf_block_t *b=
mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch)))
block= b;
@@ -1864,14 +1762,7 @@ search_loop:
btr_page_get_index_id(block->page.frame) != index->id ||
fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
!fil_page_index_page_check(block->page.frame))
- {
- corrupted:
- err= DB_CORRUPTION;
- func_exit:
- if (UNIV_LIKELY_NULL(heap))
- mem_heap_free(heap);
- return err;
- }
+ goto corrupted;
const uint32_t page_level= btr_page_get_level(block->page.frame);
@@ -1961,7 +1852,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
/* This function doesn't need to lock left page of the leaf page */
static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), "");
static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), "");
- latch_mode= btr_latch_mode(latch_mode & ~4);
+ latch_mode= btr_latch_mode(latch_mode & (RW_S_LATCH | RW_X_LATCH));
ut_ad(!latch_by_caller ||
mtr->memo_contains_flagged(&index->lock,
MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK));
@@ -1993,7 +1884,7 @@ index_locked:
buf_block_t* block=
btr_block_get(*index, page,
height ? upper_rw_latch : root_leaf_rw_latch,
- !height, mtr, &err, &first_access);
+ mtr, &err, &first_access);
ut_ad(!block == (err != DB_SUCCESS));
if (!block)
@@ -2038,7 +1929,7 @@ index_locked:
break;
if (page_has_next(block->page.frame) &&
!btr_block_get(*index, btr_page_get_next(block->page.frame),
- RW_X_LATCH, false, mtr, &err))
+ RW_X_LATCH, mtr, &err))
break;
if (!index->lock.have_x() &&
@@ -2089,7 +1980,7 @@ index_locked:
{
if (!height && first && first_access)
buf_read_ahead_linear(page_id_t(block->page.id().space(), page),
- block->page.zip_size(), false);
+ block->page.zip_size());
}
else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
lock_intention,
@@ -2145,11 +2036,6 @@ be freed by reorganizing. Differs from btr_cur_optimistic_insert because
no heuristics is applied to whether it pays to use CPU time for
reorganizing the page or not.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to inserted record if succeed, else NULL */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
rec_t*
@@ -2318,9 +2204,6 @@ static void btr_cur_prefetch_siblings(const buf_block_t *block,
{
ut_ad(page_is_leaf(block->page.frame));
- if (index->is_ibuf())
- return;
-
const page_t *page= block->page.frame;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
@@ -2555,14 +2438,6 @@ fail_err:
if (*rec) {
} else if (block->page.zip.data) {
ut_ad(!index->table->is_temporary());
- /* Reset the IBUF_BITMAP_FREE bits, because
- page_cur_tuple_insert() will have attempted page
- reorganize before failing. */
- if (leaf
- && !dict_index_is_clust(index)) {
- ibuf_reset_free_bits(block);
- }
-
goto fail;
} else {
ut_ad(!reorg);
@@ -2603,34 +2478,6 @@ fail_err:
lock_update_insert(block, *rec);
}
- if (leaf
- && !dict_index_is_clust(index)
- && !index->table->is_temporary()) {
- /* Update the free bits of the B-tree page in the
- insert buffer bitmap. */
-
- /* The free bits in the insert buffer bitmap must
- never exceed the free space on a page. It is safe to
- decrement or reset the bits in the bitmap in a
- mini-transaction that is committed before the
- mini-transaction that affects the free space. */
-
- /* It is unsafe to increment the bits in a separately
- committed mini-transaction, because in crash recovery,
- the free bits could momentarily be set too high. */
-
- if (block->page.zip.data) {
- /* Update the bits in the same mini-transaction. */
- ibuf_update_free_bits_zip(block, mtr);
- } else {
- /* Decrement the bits in a separate
- mini-transaction. */
- ibuf_update_free_bits_if_full(
- block, max_size,
- rec_size + PAGE_DIR_SLOT_SIZE);
- }
- }
-
*big_rec = big_rec_vec;
return(DB_SUCCESS);
@@ -2701,12 +2548,10 @@ btr_cur_pessimistic_insert(
the index tree, so that the insert will not fail because of
lack of space */
- if (!index->is_ibuf()
- && (err = fsp_reserve_free_extents(&n_reserved, index->table->space,
- uint32_t(cursor->tree_height / 16
- + 3),
- FSP_NORMAL, mtr))
- != DB_SUCCESS) {
+ err = fsp_reserve_free_extents(&n_reserved, index->table->space,
+ uint32_t(cursor->tree_height / 16 + 3),
+ FSP_NORMAL, mtr);
+ if (err != DB_SUCCESS) {
return err;
}
@@ -2738,11 +2583,21 @@ btr_cur_pessimistic_insert(
}
}
- *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no()
- ? btr_root_raise_and_insert(flags, cursor, offsets, heap,
- entry, n_ext, mtr, &err)
- : btr_page_split_and_insert(flags, cursor, offsets, heap,
- entry, n_ext, mtr, &err);
+ if (index->page == btr_cur_get_block(cursor)->page.id().page_no()) {
+ *rec = index->is_spatial()
+ ? rtr_root_raise_and_insert(flags, cursor, offsets,
+ heap, entry, n_ext, mtr,
+ &err, thr)
+ : btr_root_raise_and_insert(flags, cursor, offsets,
+ heap, entry, n_ext, mtr,
+ &err);
+ } else if (index->is_spatial()) {
+ *rec = rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err, thr);
+ } else {
+ *rec = btr_page_split_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err);
+ }
if (!*rec) {
goto func_exit;
@@ -2986,14 +2841,8 @@ static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
See if there is enough place in the page modification log to log
an update-in-place.
-@retval false if out of space; IBUF_BITMAP_FREE will be reset
-outside mtr if the page was recompressed
-@retval true if enough place;
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
-a secondary index leaf page. This has to be done either within the
-same mini-transaction, or by invoking ibuf_reset_free_bits() before
-mtr_commit(mtr). */
+@retval false if out of space
+@retval true if enough space */
bool
btr_cur_update_alloc_zip_func(
/*==========================*/
@@ -3014,7 +2863,6 @@ btr_cur_update_alloc_zip_func(
const page_t* page = page_cur_get_page(cursor);
ut_ad(page_zip == page_cur_get_page_zip(cursor));
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
if (page_zip_available(page_zip, dict_index_is_clust(index),
@@ -3038,26 +2886,8 @@ btr_cur_update_alloc_zip_func(
rec_offs_make_valid(page_cur_get_rec(cursor), index,
page_is_leaf(page), offsets);
- /* After recompressing a page, we must make sure that the free
- bits in the insert buffer bitmap will not exceed the free
- space on the page. Because this function will not attempt
- recompression unless page_zip_available() fails above, it is
- safe to reset the free bits if page_zip_available() fails
- again, below. The free bits can safely be reset in a separate
- mini-transaction. If page_zip_available() succeeds below, we
- can be sure that the btr_page_reorganize() above did not reduce
- the free space available on the page. */
-
- if (page_zip_available(page_zip, dict_index_is_clust(index),
- length, create)) {
- return true;
- }
- }
-
- if (!dict_index_is_clust(index)
- && !index->table->is_temporary()
- && page_is_leaf(page)) {
- ibuf_reset_free_bits(page_cur_get_block(cursor));
+ return page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create);
}
return(false);
@@ -3281,7 +3111,7 @@ We assume here that the ordering fields of the record do not change.
@return locking or undo log related error code, or
@retval DB_SUCCESS on success
@retval DB_ZIP_OVERFLOW if there is not enough space left
-on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+on a ROW_FORMAT=COMPRESSED page */
dberr_t
btr_cur_update_in_place(
/*====================*/
@@ -3301,7 +3131,6 @@ btr_cur_update_in_place(
further pages */
{
dict_index_t* index;
- dberr_t err;
rec_t* rec;
roll_ptr_t roll_ptr = 0;
ulint was_delete_marked;
@@ -3309,17 +3138,14 @@ btr_cur_update_in_place(
ut_ad(page_is_leaf(cursor->page_cur.block->page.frame));
rec = btr_cur_get_rec(cursor);
index = cursor->index();
- ut_ad(!index->is_ibuf());
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
|| index->table->is_temporary());
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(index));
+ || index->is_primary());
ut_ad(thr_get_trx(thr)->id == trx_id
- || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ || (flags & ulint(~BTR_KEEP_POS_FLAG))
== (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
| BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
@@ -3342,22 +3168,17 @@ btr_cur_update_in_place(
}
/* Do lock checking and undo logging */
- err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
- update, cmpl_info,
- thr, mtr, &roll_ptr);
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
- goto func_exit;
+ if (dberr_t err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr)) {
+ return err;
}
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
- err = btr_cur_upd_rec_sys(block, rec, index, offsets,
- thr_get_trx(thr), roll_ptr, mtr);
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- goto func_exit;
- }
+ if (flags & BTR_KEEP_SYS_FLAG) {
+ } else if (dberr_t err = btr_cur_upd_rec_sys(block, rec, index, offsets,
+ thr_get_trx(thr),
+ roll_ptr, mtr)) {
+ return err;
}
was_delete_marked = rec_get_deleted_flag(
@@ -3415,19 +3236,7 @@ btr_cur_update_in_place(
btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
}
- ut_ad(err == DB_SUCCESS);
-
-func_exit:
- if (page_zip
- && !(flags & BTR_KEEP_IBUF_BITMAP)
- && !dict_index_is_clust(index)
- && page_is_leaf(buf_block_get_frame(block))) {
- /* Update the free bits in the insert buffer. */
- ut_ad(!index->table->is_temporary());
- ibuf_update_free_bits_zip(block, mtr);
- }
-
- return(err);
+ return DB_SUCCESS;
}
/** Trim a metadata record during the rollback of instant ALTER TABLE.
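
The rewritten btr_cur_update_in_place above drops the func_exit label and the change-buffer bitmap bookkeeping in favour of early returns that declare the dberr_t inside the if condition; because DB_SUCCESS is 0, any nonzero error code is truthy. A minimal standalone sketch of that control-flow style (the enum values and helper names are made up for illustration, not InnoDB's):

#include <cstdio>

/* Stand-in for InnoDB's dberr_t; only the fact that DB_SUCCESS == 0
   matters for this pattern. */
enum dberr_t { DB_SUCCESS = 0, DB_ERROR = 1 };

static dberr_t check_locks(bool ok) { return ok ? DB_SUCCESS : DB_ERROR; }
static dberr_t write_sys_fields(bool ok) { return ok ? DB_SUCCESS : DB_ERROR; }

/* Each step declares its error code in the if condition; DB_SUCCESS (0)
   is falsy, so control simply falls through to the next step. */
static dberr_t update_in_place(bool locks_ok, bool sys_ok)
{
	if (dberr_t err = check_locks(locks_ok)) {
		return err;
	}
	if (dberr_t err = write_sys_fields(sys_ok)) {
		return err;
	}
	return DB_SUCCESS;
}

int main()
{
	std::printf("%d %d\n", update_in_place(true, true),
		    update_in_place(true, false));
	return 0;
}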
@@ -3571,7 +3380,7 @@ fields of the record do not change.
@retval DB_OVERFLOW if the updated record does not fit
@retval DB_UNDERFLOW if the page would become too empty
@retval DB_ZIP_OVERFLOW if there is not enough space left
-on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+on a ROW_FORMAT=COMPRESSED page */
dberr_t
btr_cur_optimistic_update(
/*======================*/
@@ -3602,7 +3411,6 @@ btr_cur_optimistic_update(
ulint max_size;
ulint new_rec_size;
ulint old_rec_size;
- ulint max_ins_size = 0;
dtuple_t* new_entry;
roll_ptr_t roll_ptr;
ulint i;
@@ -3611,19 +3419,16 @@ btr_cur_optimistic_update(
page = buf_block_get_frame(block);
rec = btr_cur_get_rec(cursor);
index = cursor->index();
- ut_ad(index->has_locking());
ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
|| index->table->is_temporary());
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
/* This is intended only for leaf page updates */
ut_ad(page_is_leaf(page));
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
|| dict_index_is_clust(index));
ut_ad(thr_get_trx(thr)->id == trx_id
- || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
+ || (flags & ulint(~BTR_KEEP_POS_FLAG))
== (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
| BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
ut_ad(fil_page_index_page_check(page));
@@ -3652,7 +3457,6 @@ btr_cur_optimistic_update(
if (rec_offs_any_extern(*offsets)) {
any_extern:
- ut_ad(!index->is_ibuf());
/* Externally stored fields are treated in pessimistic
update */
@@ -3733,9 +3537,6 @@ any_extern:
if (UNIV_UNLIKELY(new_rec_size
>= (page_get_free_space_of_empty(page_is_comp(page))
/ 2))) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
err = DB_OVERFLOW;
goto func_exit;
}
@@ -3743,10 +3544,6 @@ any_extern:
if (UNIV_UNLIKELY(page_get_data_size(page)
- old_rec_size + new_rec_size
< BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
-
/* The page would become too empty */
err = DB_UNDERFLOW;
goto func_exit;
@@ -3759,19 +3556,9 @@ any_extern:
: (old_rec_size
+ page_get_max_insert_size_after_reorganize(page, 1));
- if (!page_zip) {
- max_ins_size = page_get_max_insert_size_after_reorganize(
- page, 1);
- }
-
if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
&& (max_size >= new_rec_size))
|| (page_get_n_recs(page) <= 1))) {
-
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
-
/* There was not enough space, or it did not pay to
reorganize: for simplicity, we decide what to do assuming a
reorganization is needed, though it might not be necessary */
@@ -3785,9 +3572,6 @@ any_extern:
update, cmpl_info,
thr, mtr, &roll_ptr);
if (err != DB_SUCCESS) {
- /* We may need to update the IBUF_BITMAP_FREE
- bits after a reorganize that was done in
- btr_cur_update_alloc_zip(). */
goto func_exit;
}
@@ -3843,22 +3627,11 @@ any_extern:
ut_ad(err == DB_SUCCESS);
if (!page_cur_move_to_next(page_cursor)) {
corrupted:
- err = DB_CORRUPTION;
- }
-
-func_exit:
- if (!(flags & BTR_KEEP_IBUF_BITMAP)
- && !dict_index_is_clust(index)) {
- /* Update the free bits in the insert buffer. */
- if (page_zip) {
- ut_ad(!index->table->is_temporary());
- ibuf_update_free_bits_zip(block, mtr);
- } else if (!index->table->is_temporary()) {
- ibuf_update_free_bits_low(block, max_ins_size, mtr);
- }
+ return DB_CORRUPTION;
}
if (err != DB_SUCCESS) {
+func_exit:
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block, index);
@@ -3947,7 +3720,6 @@ btr_cur_pessimistic_update(
big_rec_t* dummy_big_rec;
dict_index_t* index;
buf_block_t* block;
- page_zip_des_t* page_zip;
rec_t* rec;
page_cur_t* page_cursor;
dberr_t err;
@@ -3960,20 +3732,19 @@ btr_cur_pessimistic_update(
*big_rec = NULL;
block = btr_cur_get_block(cursor);
- page_zip = buf_block_get_page_zip(block);
index = cursor->index();
- ut_ad(index->has_locking());
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+#if defined UNIV_ZIP_DEBUG || defined UNIV_DEBUG
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip
|| page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
ut_ad(!page_zip || !index->table->is_temporary());
- /* The insert buffer tree should never be updated in place. */
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
|| index->table->is_temporary());
ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
@@ -3984,7 +3755,7 @@ btr_cur_pessimistic_update(
| BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
err = optim_err = btr_cur_optimistic_update(
- flags | BTR_KEEP_IBUF_BITMAP,
+ flags,
cursor, offsets, offsets_heap, update,
cmpl_info, thr, trx_id, mtr);
@@ -3995,18 +3766,6 @@ btr_cur_pessimistic_update(
break;
default:
err_exit:
- /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
- For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
- already reset by btr_cur_update_alloc_zip() if the
- page was recompressed. */
- if (page_zip
- && optim_err != DB_ZIP_OVERFLOW
- && !dict_index_is_clust(index)
- && page_is_leaf(block->page.frame)) {
- ut_ad(!index->table->is_temporary());
- ibuf_update_free_bits_zip(block, mtr);
- }
-
if (big_rec_vec != NULL) {
dtuple_big_rec_free(big_rec_vec);
}
@@ -4084,11 +3843,6 @@ btr_cur_pessimistic_update(
index->first_user_field())))) {
big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
-
- /* We cannot goto return_after_reservations,
- because we may need to update the
- IBUF_BITMAP_FREE bits, which was suppressed by
- BTR_KEEP_IBUF_BITMAP. */
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip
|| page_zip_validate(page_zip, block->page.frame,
@@ -4139,11 +3893,6 @@ btr_cur_pessimistic_update(
btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
}
- const ulint max_ins_size = page_zip
- ? 0
- : page_get_max_insert_size_after_reorganize(block->page.frame,
- 1);
-
if (UNIV_UNLIKELY(is_metadata)) {
ut_ad(new_entry->is_metadata());
ut_ad(index->is_instant());
@@ -4228,18 +3977,6 @@ btr_cur_pessimistic_update(
rec_offs_make_valid(page_cursor->rec, index,
true, *offsets);
}
- } else if (!dict_index_is_clust(index)
- && page_is_leaf(block->page.frame)) {
- /* Update the free bits in the insert buffer.
- This is the same block which was skipped by
- BTR_KEEP_IBUF_BITMAP. */
- if (page_zip) {
- ut_ad(!index->table->is_temporary());
- ibuf_update_free_bits_zip(block, mtr);
- } else if (!index->table->is_temporary()) {
- ibuf_update_free_bits_low(block, max_ins_size,
- mtr);
- }
}
#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
@@ -4260,16 +3997,7 @@ btr_cur_pessimistic_update(
of a badly-compressing record, it is possible for
btr_cur_optimistic_update() to return DB_UNDERFLOW and
btr_cur_insert_if_possible() to return FALSE. */
- ut_a(page_zip || optim_err != DB_UNDERFLOW);
-
- /* Out of space: reset the free bits.
- This is the same block which was skipped by
- BTR_KEEP_IBUF_BITMAP. */
- if (!dict_index_is_clust(index)
- && !index->table->is_temporary()
- && page_is_leaf(block->page.frame)) {
- ibuf_reset_free_bits(block);
- }
+ ut_ad(page_zip || optim_err != DB_UNDERFLOW);
}
if (big_rec_vec != NULL) {
@@ -4314,8 +4042,7 @@ btr_cur_pessimistic_update(
same temp-table in parallel.
max_trx_id is ignored for temp tables because it is not required
for MVCC. */
- if (dict_index_is_sec_or_ibuf(index)
- && !index->table->is_temporary()) {
+ if (!index->is_primary() && !index->table->is_temporary()) {
/* Update PAGE_MAX_TRX_ID in the index page header.
It was not updated by btr_cur_pessimistic_insert()
because of BTR_NO_LOCKING_FLAG. */
@@ -4626,9 +4353,6 @@ btr_cur_optimistic_delete(
}
{
- page_t* page = buf_block_get_frame(block);
- page_zip_des_t* page_zip= buf_block_get_page_zip(block);
-
if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
& REC_INFO_MIN_REC_FLAG)) {
/* This should be rolling back instant ADD COLUMN.
@@ -4637,7 +4361,7 @@ btr_cur_optimistic_delete(
insert into SYS_COLUMNS is rolled back. */
ut_ad(cursor->index()->table->supports_instant());
ut_ad(cursor->index()->is_primary());
- ut_ad(!page_zip);
+ ut_ad(!buf_block_get_page_zip(block));
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
offsets, mtr);
/* We must empty the PAGE_FREE list, because
@@ -4655,40 +4379,8 @@ btr_cur_optimistic_delete(
btr_search_update_hash_on_delete(cursor);
}
- if (page_zip) {
-#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page,
- cursor->index()));
-#endif /* UNIV_ZIP_DEBUG */
- page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- offsets, mtr);
-#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page,
- cursor->index()));
-#endif /* UNIV_ZIP_DEBUG */
-
- /* On compressed pages, the IBUF_BITMAP_FREE
- space is not affected by deleting (purging)
- records, because it is defined as the minimum
- of space available *without* reorganize, and
- space available in the modification log. */
- } else {
- const ulint max_ins
- = page_get_max_insert_size_after_reorganize(
- page, 1);
-
- page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- offsets, mtr);
-
- /* The change buffer does not handle inserts
- into non-leaf pages, into clustered indexes,
- or into the change buffer. */
- if (!cursor->index()->is_clust()
- && !cursor->index()->table->is_temporary()
- && !dict_index_is_ibuf(cursor->index())) {
- ibuf_update_free_bits_low(block, max_ins, mtr);
- }
- }
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ offsets, mtr);
}
func_exit:
@@ -4884,9 +4576,9 @@ discard_page:
goto err_exit;
}
- btr_cur_t cursor;
- cursor.page_cur.index = index;
- cursor.page_cur.block = block;
+ btr_cur_t cur;
+ cur.page_cur.index = index;
+ cur.page_cur.block = block;
if (!page_has_prev(page)) {
/* If we delete the leftmost node pointer on a
@@ -4902,16 +4594,17 @@ discard_page:
rec_offs* offsets;
ulint len;
- rtr_page_get_father_block(NULL, heap, mtr, NULL,
- &cursor);
- father_rec = btr_cur_get_rec(&cursor);
+ rtr_page_get_father_block(nullptr, heap, nullptr,
+ &cur,
+ cursor->rtr_info->thr, mtr);
+ father_rec = btr_cur_get_rec(&cur);
offsets = rec_get_offsets(father_rec, index, NULL,
0, ULINT_UNDEFINED, &heap);
rtr_read_mbr(rec_get_nth_field(
father_rec, offsets, 0, &len), &father_mbr);
- rtr_update_mbr_field(&cursor, offsets, NULL,
+ rtr_update_mbr_field(&cur, offsets, NULL,
page, &father_mbr, next_rec, mtr);
ut_d(parent_latched = true);
} else {
@@ -4919,12 +4612,12 @@ discard_page:
on a page, we have to change the parent node pointer
so that it is equal to the new leftmost node pointer
on the page */
- ret = btr_page_get_father(mtr, &cursor);
+ ret = btr_page_get_father(mtr, &cur);
if (!ret) {
*err = DB_CORRUPTION;
goto err_exit;
}
- *err = btr_cur_node_ptr_delete(&cursor, mtr);
+ *err = btr_cur_node_ptr_delete(&cur, mtr);
if (*err != DB_SUCCESS) {
got_err:
ret = FALSE;
@@ -4971,7 +4664,10 @@ got_err:
#endif /* UNIV_ZIP_DEBUG */
ut_ad(!parent_latched
- || btr_check_node_ptr(index, block, mtr));
+ || btr_check_node_ptr(index, block,
+ cursor->rtr_info
+ ? cursor->rtr_info->thr
+ : nullptr, mtr));
if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
if (UNIV_LIKELY(allow_merge)) {
@@ -5113,7 +4809,7 @@ public:
{
buf_block_t *parent_block= m_block;
- m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level,
+ m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH,
&mtr, nullptr);
if (!m_block)
return false;
@@ -5327,8 +5023,7 @@ static ha_rows btr_estimate_n_rows_in_range_on_level(
buf_block_t *prev_block= block;
/* Fetch the page. */
- block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr,
- nullptr);
+ block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, &mtr, nullptr);
if (prev_block)
{
@@ -5644,6 +5339,7 @@ search_loop:
DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows););
+#ifdef NOT_USED
/* Do not estimate the number of rows in the range to over 1 / 2 of the
estimated rows in the whole table */
@@ -5658,6 +5354,10 @@ search_loop:
if (n_rows == 0)
n_rows= table_n_rows;
}
+#else
+ if (n_rows > table_n_rows)
+ n_rows= table_n_rows;
+#endif
DBUG_RETURN(n_rows);
@@ -5962,7 +5662,7 @@ struct btr_blob_log_check_t {
m_mtr, &err));
}
m_pcur->btr_cur.page_cur.block = btr_block_get(
- *index, page_no, RW_X_LATCH, false, m_mtr);
+ *index, page_no, RW_X_LATCH, m_mtr);
/* The page should not be evicted or corrupted while
we are holding a buffer-fix on it. */
m_pcur->btr_cur.page_cur.block->page.unfix();
@@ -6701,7 +6401,7 @@ btr_copy_blob_prefix(
return copied_len;
}
if (!buf_page_make_young_if_needed(&block->page)) {
- buf_read_ahead_linear(id, 0, false);
+ buf_read_ahead_linear(id, 0);
}
page = buf_block_get_frame(block);
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
deleted file mode 100644
index 642db0e9..00000000
--- a/storage/innobase/btr/btr0defragment.cc
+++ /dev/null
@@ -1,820 +0,0 @@
-/*****************************************************************************
-
-Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2023, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-/**************************************************//**
-@file btr/btr0defragment.cc
-Index defragmentation.
-
-Created 05/29/2014 Rongrong Zhong
-Modified 16/07/2014 Sunguck Lee
-Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
-*******************************************************/
-
-#include "btr0defragment.h"
-#include "btr0btr.h"
-#include "btr0cur.h"
-#include "btr0sea.h"
-#include "btr0pcur.h"
-#include "dict0stats.h"
-#include "dict0stats_bg.h"
-#include "dict0defrag_bg.h"
-#include "ibuf0ibuf.h"
-#include "lock0lock.h"
-#include "srv0start.h"
-#include "mysqld.h"
-
-#include <list>
-
-/* When there's no work, either because defragment is disabled, or because no
-query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
-#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
-/* Reduce the target page size by this amount when a compression failure happens
-during defragmentation. 512 is chosen because it's a power of 2 and it is about
-3% of the page size. When there are compression failures in defragmentation,
-our goal is to get a decent defrag ratio with as few compression failures as
-possible. From experimentation it seems that reducing the target size by 512 every
-time will make sure the page is compressible within a couple of iterations. */
-#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
-
-/** Item in the work queue for btr_defragment_thread. */
-struct btr_defragment_item_t
-{
- /** persistent cursor where btr_defragment_n_pages should start */
- btr_pcur_t * const pcur;
- /** completion signal */
- pthread_cond_t *cond;
- /** timestamp of last time this index is processed by defragment thread */
- ulonglong last_processed= 0;
-
- btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
- : pcur(pcur), cond(cond) {}
-};
-
-/* Work queue for defragmentation. */
-typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
-static btr_defragment_wq_t btr_defragment_wq;
-
-/* Mutex protecting the defragmentation work queue.*/
-static mysql_mutex_t btr_defragment_mutex;
-#ifdef UNIV_PFS_MUTEX
-mysql_pfs_key_t btr_defragment_mutex_key;
-#endif /* UNIV_PFS_MUTEX */
-
-/* Number of compression failures caused by defragmentation since server
-start. */
-Atomic_counter<ulint> btr_defragment_compression_failures;
-/* Number of btr_defragment_n_pages calls that altered a page but didn't
-manage to release any page. */
-Atomic_counter<ulint> btr_defragment_failures;
-/* Total number of btr_defragment_n_pages calls that altered a page.
-The difference between btr_defragment_count and btr_defragment_failures shows
-the amount of effort wasted. */
-Atomic_counter<ulint> btr_defragment_count;
-
-bool btr_defragment_active;
-static void btr_defragment_chunk(void*);
-
-static tpool::timer* btr_defragment_timer;
-static tpool::task_group task_group(1);
-static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
-static void btr_defragment_start();
-
-static void submit_defragment_task(void*arg=0)
-{
- srv_thread_pool->submit_task(&btr_defragment_task);
-}
-
-/******************************************************************//**
-Initialize defragmentation. */
-void
-btr_defragment_init()
-{
- srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
- mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
- nullptr);
- btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
- btr_defragment_active = true;
-}
-
-/******************************************************************//**
-Shutdown defragmentation. Release all resources. */
-void
-btr_defragment_shutdown()
-{
- if (!btr_defragment_timer)
- return;
- delete btr_defragment_timer;
- btr_defragment_timer = 0;
- task_group.cancel_pending(&btr_defragment_task);
- mysql_mutex_lock(&btr_defragment_mutex);
- std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- while(iter != btr_defragment_wq.end()) {
- btr_defragment_item_t* item = *iter;
- iter = btr_defragment_wq.erase(iter);
- if (item->cond) {
- pthread_cond_signal(item->cond);
- }
- }
- mysql_mutex_unlock(&btr_defragment_mutex);
- mysql_mutex_destroy(&btr_defragment_mutex);
- btr_defragment_active = false;
-}
-
-
-/******************************************************************//**
-Functions used by the query threads: btr_defragment_xxx_index
-Query threads find/add/remove index. */
-/******************************************************************//**
-Check whether the given index is in btr_defragment_wq. We use index->id
-to identify indices. */
-bool
-btr_defragment_find_index(
- dict_index_t* index) /*!< Index to find. */
-{
- mysql_mutex_lock(&btr_defragment_mutex);
- for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- iter != btr_defragment_wq.end();
- ++iter) {
- btr_defragment_item_t* item = *iter;
- btr_pcur_t* pcur = item->pcur;
- btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
- dict_index_t* idx = btr_cur_get_index(cursor);
- if (index->id == idx->id) {
- mysql_mutex_unlock(&btr_defragment_mutex);
- return true;
- }
- }
- mysql_mutex_unlock(&btr_defragment_mutex);
- return false;
-}
-
-/** Defragment an index.
-@param pcur persistent cursor
-@param thd current session, for checking thd_killed()
-@return whether the operation was interrupted */
-bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
-{
- dict_stats_empty_defrag_summary(pcur->index());
- pthread_cond_t cond;
- pthread_cond_init(&cond, nullptr);
- btr_defragment_item_t item(pcur, &cond);
- mysql_mutex_lock(&btr_defragment_mutex);
- btr_defragment_wq.push_back(&item);
- if (btr_defragment_wq.size() == 1)
- /* Kick off defragmentation work */
- btr_defragment_start();
- bool interrupted= false;
- for (;;)
- {
- timespec abstime;
- set_timespec(abstime, 1);
- if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
- break;
- if (thd_killed(thd))
- {
- item.cond= nullptr;
- interrupted= true;
- break;
- }
- }
-
- pthread_cond_destroy(&cond);
- mysql_mutex_unlock(&btr_defragment_mutex);
- return interrupted;
-}
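
btr_defragment_add_index (removed above) blocks the submitting session on a condition variable, waking once per second to poll thd_killed(). A simplified standalone model of that wait loop, using std::condition_variable instead of my_cond_timedwait; the names, timings and the worker thread are illustrative only:

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

static std::mutex m;
static std::condition_variable cond;
static bool done = false;                 /* set when the item is processed */
static std::atomic<bool> killed{false};   /* analogue of thd_killed() */

/* Wait for completion, re-checking the kill flag once per second.
   Returns true if the wait was interrupted before completion. */
static bool wait_for_completion()
{
	std::unique_lock<std::mutex> lock(m);
	for (;;) {
		if (cond.wait_for(lock, std::chrono::seconds(1),
				  [] { return done; })) {
			return false;
		}
		if (killed.load()) {
			return true;
		}
	}
}

int main()
{
	std::thread worker([] {
		std::this_thread::sleep_for(std::chrono::milliseconds(100));
		{
			std::lock_guard<std::mutex> g(m);
			done = true;
		}
		cond.notify_one();
	});
	const bool interrupted = wait_for_completion();
	worker.join();
	std::printf("interrupted: %d\n", interrupted);
	return 0;
}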
-
-/******************************************************************//**
-When a table is dropped, this function is called to mark the table as removed in
-btr_defragment_wq. The difference between this function and the remove_index
-function is that this one will not NULL the event. */
-void
-btr_defragment_remove_table(
-	dict_table_t*	table)	/*!< Table to be removed. */
-{
- mysql_mutex_lock(&btr_defragment_mutex);
- for (auto item : btr_defragment_wq)
- {
- if (item->cond && table == item->pcur->index()->table)
- {
- pthread_cond_signal(item->cond);
- item->cond= nullptr;
- }
- }
- mysql_mutex_unlock(&btr_defragment_mutex);
-}
-
-/*********************************************************************//**
-Check whether we should save defragmentation statistics to persistent storage.
-Currently we save the stats to persistent storage every 100 updates. */
-void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
-{
- if (srv_defragment_stats_accuracy != 0 // stats tracking disabled
- && index->table->space_id != 0 // do not track system tables
- && !index->table->is_temporary()
- && index->stat_defrag_modified_counter
- >= srv_defragment_stats_accuracy) {
- dict_stats_defrag_pool_add(index);
- index->stat_defrag_modified_counter = 0;
- }
-}
-
-/*********************************************************************//**
-Main defragment functionalities used by defragment thread.*/
-/*************************************************************//**
-Calculate the number of records from the beginning of the block that can
-fit into size_limit
-@return number of records */
-static
-ulint
-btr_defragment_calc_n_recs_for_size(
- buf_block_t* block, /*!< in: B-tree page */
- dict_index_t* index, /*!< in: index of the page */
- ulint size_limit, /*!< in: size limit to fit records in */
- ulint* n_recs_size) /*!< out: actual size of the records that fit
- in size_limit. */
-{
- page_t* page = buf_block_get_frame(block);
- ulint n_recs = 0;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets = offsets_;
- rec_offs_init(offsets_);
- mem_heap_t* heap = NULL;
- ulint size = 0;
- page_cur_t cur;
-
- const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
- page_cur_set_before_first(block, &cur);
- while (rec_t* cur_rec = page_cur_move_to_next(&cur)) {
- if (page_rec_is_supremum(cur_rec)) {
- break;
- }
- offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
- ULINT_UNDEFINED, &heap);
- ulint rec_size = rec_offs_size(offsets);
- size += rec_size;
- if (size > size_limit) {
- size = size - rec_size;
- break;
- }
- n_recs ++;
- }
- *n_recs_size = size;
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
- return n_recs;
-}
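
btr_defragment_calc_n_recs_for_size (removed above) walks the page and counts how many leading records fit into a byte budget. The same accounting, reduced to a standalone sketch over a plain vector of record sizes (the data values are illustrative):

#include <cstdio>
#include <vector>

/* Simplified model of the size-limited scan above: walk the record
   sizes in page order and count how many of the leading records fit
   into size_limit, also reporting their total size. */
static unsigned n_recs_for_size(const std::vector<unsigned>& rec_sizes,
				unsigned size_limit, unsigned* n_recs_size)
{
	unsigned n_recs = 0;
	unsigned size = 0;
	for (unsigned rec_size : rec_sizes) {
		if (size + rec_size > size_limit) {
			break;
		}
		size += rec_size;
		n_recs++;
	}
	*n_recs_size = size;
	return n_recs;
}

int main()
{
	const std::vector<unsigned> sizes{120, 80, 200, 64, 300};
	unsigned used = 0;
	unsigned n = n_recs_for_size(sizes, 400, &used);
	std::printf("%u records fit, using %u bytes\n", n, used); /* 3, 400 */
	return 0;
}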
-
-MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
-/************************************************************//**
-Returns the upper level node pointer to a page. It is assumed that mtr holds
-an sx-latch on the tree.
-@return rec_get_offsets() of the node pointer record */
-static
-rec_offs*
-btr_page_search_father_node_ptr(
- rec_offs* offsets,/*!< in: work area for the return value */
- mem_heap_t* heap, /*!< in: memory heap to use */
- btr_cur_t* cursor, /*!< in: cursor pointing to user record,
- out: cursor on node pointer record,
- its page x-latched */
- mtr_t* mtr) /*!< in: mtr */
-{
- const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
- dict_index_t* index = btr_cur_get_index(cursor);
- ut_ad(!index->is_spatial());
-
- ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- ut_ad(dict_index_get_page(index) != page_no);
-
- const auto level = btr_page_get_level(btr_cur_get_page(cursor));
-
- const rec_t* user_rec = btr_cur_get_rec(cursor);
- ut_a(page_rec_is_user_rec(user_rec));
-
- if (btr_cur_search_to_nth_level(level + 1,
- dict_index_build_node_ptr(index,
- user_rec, 0,
- heap, level),
- RW_X_LATCH,
- cursor, mtr) != DB_SUCCESS) {
- return nullptr;
- }
-
- const rec_t* node_ptr = btr_cur_get_rec(cursor);
- ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
- || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
-
- offsets = rec_get_offsets(node_ptr, index, offsets, 0,
- ULINT_UNDEFINED, &heap);
-
- if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
- offsets = nullptr;
- }
-
- return(offsets);
-}
-
-static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
-{
- rec_t *rec=
- page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
- if (UNIV_UNLIKELY(!rec))
- return false;
- cursor->page_cur.rec= rec;
- mem_heap_t *heap= mem_heap_create(100);
- const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr);
- mem_heap_free(heap);
- return got;
-}
-
-/*************************************************************//**
-Merge as many records as possible from the from_block to the to_block. Delete
-the from_block if all records are successfully merged to to_block.
-@return the to_block to target for next merge operation.
-@retval nullptr if corruption was noticed */
-static
-buf_block_t*
-btr_defragment_merge_pages(
- dict_index_t* index, /*!< in: index tree */
- buf_block_t* from_block, /*!< in: origin of merge */
- buf_block_t* to_block, /*!< in: destination of merge */
- ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */
- ulint reserved_space, /*!< in: space reserved for future
- insert to avoid immediate page split */
- ulint* max_data_size, /*!< in/out: max data size to
- fit in a single compressed page. */
- mem_heap_t* heap, /*!< in/out: pointer to memory heap */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- page_t* from_page = buf_block_get_frame(from_block);
- page_t* to_page = buf_block_get_frame(to_block);
- ulint level = btr_page_get_level(from_page);
- ulint n_recs = page_get_n_recs(from_page);
- ulint new_data_size = page_get_data_size(to_page);
- ulint max_ins_size =
- page_get_max_insert_size(to_page, n_recs);
- ulint max_ins_size_reorg =
- page_get_max_insert_size_after_reorganize(
- to_page, n_recs);
- ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
- ? max_ins_size_reorg - reserved_space : 0;
- ulint move_size = 0;
- ulint n_recs_to_move = 0;
- rec_t* rec = NULL;
- ulint target_n_recs = 0;
- rec_t* orig_pred;
-
- // Estimate how many records can be moved from the from_page to
- // the to_page.
- if (zip_size) {
- ulint page_diff = srv_page_size - *max_data_size;
- max_ins_size_to_use = (max_ins_size_to_use > page_diff)
- ? max_ins_size_to_use - page_diff : 0;
- }
- n_recs_to_move = btr_defragment_calc_n_recs_for_size(
- from_block, index, max_ins_size_to_use, &move_size);
-
- // If max_ins_size >= move_size, we can move the records without
- // reorganizing the page, otherwise we need to reorganize the page
- // first to release more space.
- if (move_size > max_ins_size) {
- dberr_t err = btr_page_reorganize_block(page_zip_level,
- to_block, index, mtr);
- if (err != DB_SUCCESS) {
- if (!dict_index_is_clust(index)
- && page_is_leaf(to_page)) {
- ibuf_reset_free_bits(to_block);
- }
-			// If reorganization fails, that means the page is
-			// not compressible. There's no point in trying to
-			// merge into this page. Continue to the
-			// next page.
- return err == DB_FAIL ? from_block : nullptr;
- }
- ut_ad(page_validate(to_page, index));
- max_ins_size = page_get_max_insert_size(to_page, n_recs);
- if (max_ins_size < move_size) {
- return nullptr;
- }
- }
-
-	// Move records to pack to_page as full as possible.
- orig_pred = NULL;
- target_n_recs = n_recs_to_move;
- dberr_t err;
- while (n_recs_to_move > 0) {
- if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) {
- return nullptr;
- }
- orig_pred = page_copy_rec_list_start(
- to_block, from_block, rec, index, mtr, &err);
- if (orig_pred)
- break;
- if (err != DB_FAIL) {
- return nullptr;
- }
-
- // If we reach here, that means compression failed after packing
- // n_recs_to_move number of records to to_page. We try to reduce
- // the targeted data size on the to_page by
- // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
- btr_defragment_compression_failures++;
- max_ins_size_to_use =
- move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
- ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
- : 0;
- if (max_ins_size_to_use == 0) {
- n_recs_to_move = 0;
- move_size = 0;
- break;
- }
- n_recs_to_move = btr_defragment_calc_n_recs_for_size(
- from_block, index, max_ins_size_to_use, &move_size);
- }
- // If less than target_n_recs are moved, it means there are
- // compression failures during page_copy_rec_list_start. Adjust
- // the max_data_size estimation to reduce compression failures
- // in the following runs.
- if (target_n_recs > n_recs_to_move
- && *max_data_size > new_data_size + move_size) {
- *max_data_size = new_data_size + move_size;
- }
- // Set ibuf free bits if necessary.
- if (!dict_index_is_clust(index)
- && page_is_leaf(to_page)) {
- if (zip_size) {
- ibuf_reset_free_bits(to_block);
- } else {
- ibuf_update_free_bits_if_full(
- to_block,
- srv_page_size,
- ULINT_UNDEFINED);
- }
- }
- btr_cur_t parent;
- parent.page_cur.index = index;
- parent.page_cur.block = from_block;
-
- if (!btr_page_search_father(mtr, &parent)) {
- to_block = nullptr;
- } else if (n_recs_to_move == n_recs) {
- /* The whole page is merged with the previous page,
- free it. */
- lock_update_merge_left(*to_block, orig_pred,
- from_block->page.id());
- btr_search_drop_page_hash_index(from_block, false);
- if (btr_level_list_remove(*from_block, *index, mtr)
- != DB_SUCCESS
- || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS
- || btr_page_free(index, from_block, mtr) != DB_SUCCESS) {
- return nullptr;
- }
- } else {
- // There are still records left on the page, so
- // increment n_defragmented. Node pointer will be changed
- // so remove the old node pointer.
- if (n_recs_to_move > 0) {
- // Part of the page is merged to left, remove
- // the merged records, update record locks and
- // node pointer.
- dtuple_t* node_ptr;
- page_delete_rec_list_start(rec, from_block,
- index, mtr);
- lock_update_split_and_merge(to_block,
- orig_pred,
- from_block);
- // FIXME: reuse the node_ptr!
- if (btr_cur_node_ptr_delete(&parent, mtr)
- != DB_SUCCESS) {
- return nullptr;
- }
- rec = page_rec_get_next(
- page_get_infimum_rec(from_page));
- if (!rec) {
- return nullptr;
- }
- node_ptr = dict_index_build_node_ptr(
- index, rec, page_get_page_no(from_page),
- heap, level);
- if (btr_insert_on_non_leaf_level(0, index, level+1,
- node_ptr, mtr)
- != DB_SUCCESS) {
- return nullptr;
- }
- }
- to_block = from_block;
- }
- return to_block;
-}
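
The retry loop in btr_defragment_merge_pages (removed above) reacts to a page compression failure by shrinking the amount of data it tries to move by BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE (512 bytes) and trying again. A simplified standalone model of that back-off; try_compress and the numbers in main are made up to show the shape of the loop:

#include <cstdio>

/* 512 bytes, about 3% of a 16KiB page, as explained at the top of
   the removed file. */
static const unsigned STEP = 512;

/* Hypothetical stand-in: pretend compression succeeds only once the
   amount of moved data drops to max_compressible or below. */
static bool try_compress(unsigned move_size, unsigned max_compressible)
{
	return move_size <= max_compressible;
}

static unsigned defragment_move_size(unsigned move_size,
				     unsigned max_compressible,
				     unsigned* failures)
{
	while (move_size > 0 && !try_compress(move_size, max_compressible)) {
		++*failures;
		move_size = move_size > STEP ? move_size - STEP : 0;
	}
	return move_size;
}

int main()
{
	unsigned failures = 0;
	unsigned moved = defragment_move_size(4000, 3000, &failures);
	std::printf("moved %u bytes after %u compression failures\n",
		    moved, failures);	/* 2976 bytes after 2 failures */
	return 0;
}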
-
-/*************************************************************//**
-Tries to merge N consecutive pages, starting from the page pointed by the
-cursor. Skip space 0. Only consider leaf pages.
-This function first loads all N pages into memory, then for each of
-the pages other than the first page, it tries to move as many records
-as possible to the left sibling to keep the left sibling full. During
-the process, if any page becomes empty, that page will be removed from
-the level list. Record locks, hash, and node pointers are updated after
-page reorganization.
-@return pointer to the last block processed, or NULL if reaching end of index */
-static
-buf_block_t*
-btr_defragment_n_pages(
- buf_block_t* block, /*!< in: starting block for defragmentation */
- dict_index_t* index, /*!< in: index tree */
- uint n_pages,/*!< in: number of pages to defragment */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
-	/* We will need to load the (n+1)th block because if the last page
-	is freed, we need to modify the prev_page_no of the block after it. */
- buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
- page_t* first_page;
- buf_block_t* current_block;
- ulint total_data_size = 0;
- ulint total_n_recs = 0;
- ulint data_size_per_rec;
- ulint optimal_page_size;
- ulint reserved_space;
- ulint max_data_size = 0;
- uint n_defragmented = 0;
- uint n_new_slots;
- mem_heap_t* heap;
- ibool end_of_index = FALSE;
-
- /* It doesn't make sense to call this function with n_pages = 1. */
- ut_ad(n_pages > 1);
-
- if (!page_is_leaf(block->page.frame)) {
- return NULL;
- }
-
- if (!index->table->space || !index->table->space_id) {
- /* Ignore space 0. */
- return NULL;
- }
-
- if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
- n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
- }
-
- first_page = buf_block_get_frame(block);
- const ulint zip_size = index->table->space->zip_size();
-
- /* 1. Load the pages and calculate the total data size. */
- blocks[0] = block;
- for (uint i = 1; i <= n_pages; i++) {
- page_t* page = buf_block_get_frame(blocks[i-1]);
- uint32_t page_no = btr_page_get_next(page);
- total_data_size += page_get_data_size(page);
- total_n_recs += page_get_n_recs(page);
- if (page_no == FIL_NULL) {
- n_pages = i;
- end_of_index = TRUE;
- break;
- }
-
- blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
- mtr);
- if (!blocks[i]) {
- return nullptr;
- }
- }
-
- if (n_pages == 1) {
- if (!page_has_prev(first_page)) {
- /* last page in the index */
- if (dict_index_get_page(index)
- == page_get_page_no(first_page))
- return NULL;
- /* given page is the last page.
- Lift the records to father. */
- dberr_t err;
- btr_lift_page_up(index, block, mtr, &err);
- }
- return NULL;
- }
-
-	/* 2. Calculate how many pages the data can fit in. If not
-	compressible, return early. */
- ut_a(total_n_recs != 0);
- data_size_per_rec = total_data_size / total_n_recs;
-	// For uncompressed pages, the optimal data size is the free space
-	// of an empty page.
- optimal_page_size = page_get_free_space_of_empty(
- page_is_comp(first_page));
- // For compressed pages, we take compression failures into account.
- if (zip_size) {
- ulint size = 0;
- uint i = 0;
-		// We estimate the optimal data size of the index using samples of
- // data size. These samples are taken when pages failed to
- // compress due to insertion on the page. We use the average
- // of all samples we have as the estimation. Different pages of
- // the same index vary in compressibility. Average gives a good
- // enough estimation.
- for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
- if (index->stat_defrag_data_size_sample[i] == 0) {
- break;
- }
- size += index->stat_defrag_data_size_sample[i];
- }
- if (i != 0) {
- size /= i;
- optimal_page_size = ut_min(optimal_page_size, size);
- }
- max_data_size = optimal_page_size;
- }
-
- reserved_space = ut_min(static_cast<ulint>(
- static_cast<double>(optimal_page_size)
- * (1 - srv_defragment_fill_factor)),
- (data_size_per_rec
- * srv_defragment_fill_factor_n_recs));
- optimal_page_size -= reserved_space;
- n_new_slots = uint((total_data_size + optimal_page_size - 1)
- / optimal_page_size);
- if (n_new_slots >= n_pages) {
- /* Can't defragment. */
- if (end_of_index)
- return NULL;
- return blocks[n_pages-1];
- }
-
- /* 3. Defragment pages. */
- heap = mem_heap_create(256);
- // First defragmented page will be the first page.
- current_block = blocks[0];
- // Start from the second page.
- for (uint i = 1; i < n_pages; i ++) {
- buf_block_t* new_block = btr_defragment_merge_pages(
- index, blocks[i], current_block, zip_size,
- reserved_space, &max_data_size, heap, mtr);
- if (new_block != current_block) {
- n_defragmented ++;
- current_block = new_block;
- if (!new_block) {
- break;
- }
- }
- }
- mem_heap_free(heap);
- n_defragmented ++;
- btr_defragment_count++;
- if (n_pages == n_defragmented) {
- btr_defragment_failures++;
- } else {
- index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
- }
- if (end_of_index)
- return NULL;
- return current_block;
-}
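
Step 2 of btr_defragment_n_pages (removed above) decides whether defragmentation can help at all: it derives a per-record size, subtracts a reserved slack from the usable space of an empty page, and compares the resulting number of needed page slots against the number of pages scanned. A standalone sketch of that arithmetic for the uncompressed case; the fill factor, record-count factor and empty-page free space are assumed example values, not the server's settings:

#include <algorithm>
#include <cstdio>

static unsigned pages_needed(unsigned total_data_size, unsigned total_n_recs)
{
	/* Assumed stand-ins for srv_defragment_fill_factor,
	srv_defragment_fill_factor_n_recs and the free space of an
	empty 16KiB page. */
	const double	fill_factor = 0.9;
	const unsigned	fill_factor_n_recs = 20;
	const unsigned	empty_page_free = 16000;

	unsigned data_size_per_rec = total_data_size / total_n_recs;
	unsigned reserved = std::min<unsigned>(
		unsigned(empty_page_free * (1 - fill_factor)),
		data_size_per_rec * fill_factor_n_recs);
	unsigned optimal = empty_page_free - reserved;
	/* Round up: how many page "slots" the data needs. */
	return (total_data_size + optimal - 1) / optimal;
}

int main()
{
	const unsigned n_pages = 4;
	unsigned needed = pages_needed(30000, 300);
	std::printf("%u pages hold data that fits into %u page(s): %s\n",
		    n_pages, needed,
		    needed < n_pages ? "defragment" : "skip");
	return 0;
}

In the removed code the same computation also feeds the compressed-page path, where the empty-page figure is replaced by an estimate built from past compression-failure samples.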
-
-
-
-void btr_defragment_start() {
- if (!srv_defragment)
- return;
- ut_ad(!btr_defragment_wq.empty());
- submit_defragment_task();
-}
-
-
-/**
-Callback used by defragment timer
-
-Throttling ("sleep") is implemented by rescheduling the
-threadpool timer, which, when fired, resumes the work
-where it left off.
-
-The state (the current item) is stored in a function parameter.
-*/
-static void btr_defragment_chunk(void*)
-{
- THD *thd = innobase_create_background_thd("InnoDB defragment");
- set_current_thd(thd);
-
- btr_defragment_item_t* item = nullptr;
- mtr_t mtr;
-
- mysql_mutex_lock(&btr_defragment_mutex);
-
- while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
- if (!item) {
- if (btr_defragment_wq.empty()) {
-release_and_exit:
- mysql_mutex_unlock(&btr_defragment_mutex);
-func_exit:
- set_current_thd(nullptr);
- destroy_background_thd(thd);
- return;
- }
- item = *btr_defragment_wq.begin();
- ut_ad(item);
- }
-
- if (!item->cond) {
-processed:
- btr_defragment_wq.remove(item);
- item = nullptr;
- continue;
- }
-
- mysql_mutex_unlock(&btr_defragment_mutex);
-
- ulonglong now = my_interval_timer();
- ulonglong elapsed = now - item->last_processed;
-
- if (elapsed < srv_defragment_interval) {
- /* If we see an index again before the interval
- determined by the configured frequency is reached,
-		we just sleep until the interval passes. Since
-		defragmentation of all indices queues up on a single
- thread, it's likely other indices that follow this one
- don't need to sleep again. */
- int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
- if (sleep_ms) {
- btr_defragment_timer->set_time(sleep_ms, 0);
- goto func_exit;
- }
- }
- log_free_check();
- mtr_start(&mtr);
- dict_index_t *index = item->pcur->index();
- index->set_modified(mtr);
- /* To follow the latching order defined in WL#6326,
- acquire index->lock X-latch. This entitles us to
- acquire page latches in any order for the index. */
- mtr_x_lock_index(index, &mtr);
- if (buf_block_t *last_block =
- item->pcur->restore_position(
- BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
- == btr_pcur_t::CORRUPTED
- ? nullptr
- : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
- index, srv_defragment_n_pages,
- &mtr)) {
- /* If we haven't reached the end of the index,
- place the cursor on the last record of last page,
- store the cursor position, and put back in queue. */
- page_t* last_page = buf_block_get_frame(last_block);
- rec_t* rec = page_rec_get_prev(
- page_get_supremum_rec(last_page));
- if (rec && page_rec_is_user_rec(rec)) {
- page_cur_position(rec, last_block,
- btr_pcur_get_page_cur(
- item->pcur));
- }
- btr_pcur_store_position(item->pcur, &mtr);
- mtr_commit(&mtr);
- /* Update the last_processed time of this index. */
- item->last_processed = now;
- mysql_mutex_lock(&btr_defragment_mutex);
- } else {
- mtr_commit(&mtr);
- /* Reaching the end of the index. */
- dict_stats_empty_defrag_stats(index);
- if (dberr_t err= dict_stats_save_defrag_stats(index)) {
- ib::error() << "Saving defragmentation stats for table "
- << index->table->name
- << " index " << index->name()
- << " failed with error " << err;
- } else {
- err = dict_stats_save_defrag_summary(index,
- thd);
-
- if (err != DB_SUCCESS) {
- ib::error() << "Saving defragmentation summary for table "
- << index->table->name
- << " index " << index->name()
- << " failed with error " << err;
- }
- }
-
- mysql_mutex_lock(&btr_defragment_mutex);
- if (item->cond) {
- pthread_cond_signal(item->cond);
- }
- goto processed;
- }
- }
-
- goto release_and_exit;
-}
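
btr_defragment_chunk (removed above) throttles itself by converting srv_defragment_frequency into a minimum interval of 1000000000 / frequency nanoseconds and, when an index comes up again too early, rescheduling the timer for the remaining milliseconds. The remaining-time computation as a standalone sketch; the frequency value in main is an assumed example:

#include <cstdint>
#include <cstdio>

/* Simplified model of the throttling above: the defragmentation
   frequency (passes per second) becomes a minimum interval in
   nanoseconds, and if an index is seen again too soon, the timer is
   rescheduled for the remaining milliseconds instead of busy-working. */
static int sleep_ms_before_next_pass(uint64_t frequency_per_sec,
				     uint64_t elapsed_ns)
{
	const uint64_t interval_ns = 1000000000ULL / frequency_per_sec;
	if (elapsed_ns >= interval_ns) {
		return 0;	/* work on the index immediately */
	}
	return int((interval_ns - elapsed_ns) / 1000 / 1000);
}

int main()
{
	/* With an assumed frequency of 40 passes/second (25 ms interval),
	an index seen again after 5 ms must wait 20 more milliseconds. */
	std::printf("sleep %d ms\n", sleep_ms_before_next_pass(40, 5000000));
	return 0;
}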
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index de0f9e93..61afa3c9 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -158,20 +158,14 @@ before_first:
cursor->rel_pos = BTR_PCUR_ON;
}
- if (index->is_ibuf()) {
- ut_ad(!index->table->not_redundant());
- cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec));
- } else {
- cursor->old_n_fields = static_cast<uint16>(
- dict_index_get_n_unique_in_tree(index));
- if (index->is_spatial() && !page_rec_is_leaf(rec)) {
- ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
- == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
- /* For R-tree, we have to compare
- the child page numbers as well. */
- cursor->old_n_fields
- = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
- }
+ cursor->old_n_fields = static_cast<uint16>(
+ dict_index_get_n_unique_in_tree(index));
+ if (index->is_spatial() && !page_rec_is_leaf(rec)) {
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
+ == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ /* For R-tree, we have to compare
+ the child page numbers as well. */
+ cursor->old_n_fields = DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
}
cursor->old_n_core_fields = index->n_core_fields;
@@ -524,11 +518,11 @@ btr_pcur_move_to_next_page(
}
dberr_t err;
- bool first_access = false;
+ bool first_access = false;
buf_block_t* next_block = btr_block_get(
*cursor->index(), next_page_no,
rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)),
- page_is_leaf(page), mtr, &err, &first_access);
+ mtr, &err, &first_access);
if (UNIV_UNLIKELY(!next_block)) {
return err;
@@ -549,8 +543,7 @@ btr_pcur_move_to_next_page(
mtr->rollback_to_savepoint(s - 2, s - 1);
if (first_access) {
buf_read_ahead_linear(next_block->page.id(),
- next_block->zip_size(),
- ibuf_inside(mtr));
+ next_block->zip_size());
}
return DB_SUCCESS;
}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index 1c5928c4..c32d32ab 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -298,13 +298,6 @@ static void btr_search_info_update_hash(btr_search_t *info, btr_cur_t *cursor)
dict_index_t* index = cursor->index();
int cmp;
- if (dict_index_is_ibuf(index)) {
- /* So many deletes are performed on an insert buffer tree
- that we do not consider a hash index useful on it: */
-
- return;
- }
-
uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
if (info->n_hash_potential == 0) {
@@ -705,7 +698,6 @@ btr_search_update_hash_ref(
ut_ad(block->page.id().space() == index->table->space_id);
ut_ad(index == cursor->index());
- ut_ad(!dict_index_is_ibuf(index));
auto part = btr_search_sys.get_part(*index);
part->latch.wr_lock(SRW_LOCK_CALL);
ut_ad(!block->index || block->index == index);
@@ -1050,7 +1042,7 @@ btr_search_guess_on_hash(
index_id_t index_id;
ut_ad(mtr->is_active());
- ut_ad(index->is_btree() || index->is_ibuf());
+ ut_ad(index->is_btree());
/* Note that, for efficiency, the struct info may not be protected by
any latch here! */
@@ -1259,7 +1251,6 @@ retry:
ut_ad(block->page.id().space() == index->table->space_id);
ut_a(index_id == index->id);
- ut_ad(!dict_index_is_ibuf(index));
n_fields = block->curr_n_fields;
n_bytes = block->curr_n_bytes;
@@ -1462,7 +1453,6 @@ btr_search_build_page_hash_index(
ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
ut_ad(index);
ut_ad(block->page.id().space() == index->table->space_id);
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(page_is_leaf(block->page.frame));
ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
@@ -1788,7 +1778,6 @@ void btr_search_update_hash_on_delete(btr_cur_t *cursor)
ut_ad(block->page.id().space() == index->table->space_id);
ut_a(index == cursor->index());
ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0);
- ut_ad(!dict_index_is_ibuf(index));
rec = btr_cur_get_rec(cursor);
@@ -1861,7 +1850,6 @@ void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
}
ut_a(cursor->index() == index);
- ut_ad(!dict_index_is_ibuf(index));
ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!block->index || !btr_search_enabled) {
@@ -1954,7 +1942,6 @@ drop:
}
ut_a(index == cursor->index());
- ut_ad(!dict_index_is_ibuf(index));
n_fields = block->curr_n_fields;
n_bytes = block->curr_n_bytes;
@@ -2208,7 +2195,6 @@ func_exit:
invokes btr_search_drop_page_hash_index(). */
ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
state_ok:
- ut_ad(!dict_index_is_ibuf(block->index));
ut_ad(block->page.id().space()
== block->index->table->space_id);
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index 85a698bc..f43c6672 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2021, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -454,7 +454,7 @@ byte *buf_buddy_alloc_low(ulint i, bool *lru)
}
/* Try replacing an uncompressed page in the buffer pool. */
- block = buf_LRU_get_free_block(true);
+ block = buf_LRU_get_free_block(have_mutex);
if (lru) {
*lru = true;
}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 49f73105..bdeaae23 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -46,7 +46,6 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0dblwr.h"
#include "lock0lock.h"
#include "btr0sea.h"
-#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "trx0purge.h"
#include "log0log.h"
@@ -584,7 +583,7 @@ bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
DBUG_EXECUTE_IF(
"page_intermittent_checksum_mismatch", {
static int page_counter;
- if (page_counter++ == 3) {
+ if (page_counter++ == 6) {
crc32++;
}
});
@@ -719,7 +718,8 @@ bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
DBUG_EXECUTE_IF(
"page_intermittent_checksum_mismatch", {
static int page_counter;
- if (page_counter++ == 3) return true;
+ if (page_counter++ == 6)
+ return true;
});
if ((checksum_field1 != crc32
@@ -2060,9 +2060,6 @@ calc_buf_pool_size:
" and dictionary.";
}
- /* normalize ibuf.max_size */
- ibuf_max_size_update(srv_change_buffer_max_size);
-
if (srv_buf_pool_old_size != srv_buf_pool_size) {
buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes."
@@ -2241,7 +2238,6 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
ut_ad(bpage == buf_pool.page_hash.get(id, chain));
- ut_ad(!buf_pool.watch_is_sentinel(*bpage));
ut_d(const auto state= bpage->state());
ut_ad(state >= buf_page_t::FREED);
ut_ad(state <= buf_page_t::READ_FIX);
@@ -2285,131 +2281,6 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
buf_pool.page_hash.replace(chain, bpage, dpage);
}
-buf_page_t *buf_pool_t::watch_set(const page_id_t id,
- buf_pool_t::hash_chain &chain)
-{
- ut_ad(&chain == &page_hash.cell_get(id.fold()));
- page_hash.lock_get(chain).lock();
-
- buf_page_t *bpage= page_hash.get(id, chain);
-
- if (bpage)
- {
-got_block:
- bpage->fix();
- if (watch_is_sentinel(*bpage))
- {
- ut_ad(!bpage->oldest_modification());
- bpage= nullptr;
- }
- page_hash.lock_get(chain).unlock();
- return bpage;
- }
-
- page_hash.lock_get(chain).unlock();
- /* Allocate a watch[] and then try to insert it into the page_hash. */
- mysql_mutex_lock(&mutex);
-
- /* The maximum number of purge tasks should never exceed
- the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
- watch when setting another watch. */
- for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; )
- {
- ut_ad(w->access_time == 0);
- ut_ad(!w->oldest_modification());
- ut_ad(!w->zip.data);
- ut_ad(!w->in_zip_hash);
- static_assert(buf_page_t::NOT_USED == 0, "efficiency");
- if (ut_d(auto s=) w->state())
- {
- /* This watch may be in use for some other page. */
- ut_ad(s >= buf_page_t::UNFIXED);
- continue;
- }
- /* w is pointing to watch[], which is protected by mutex.
- Normally, buf_page_t::id for objects that are reachable by
- page_hash.get(id, chain) are protected by hash_lock. */
- w->set_state(buf_page_t::UNFIXED + 1);
- w->id_= id;
-
- page_hash.lock_get(chain).lock();
- bpage= page_hash.get(id, chain);
- if (UNIV_LIKELY_NULL(bpage))
- {
- w->set_state(buf_page_t::NOT_USED);
- mysql_mutex_unlock(&mutex);
- goto got_block;
- }
-
- ut_ad(w->state() == buf_page_t::UNFIXED + 1);
- buf_pool.page_hash.append(chain, w);
- mysql_mutex_unlock(&mutex);
- page_hash.lock_get(chain).unlock();
- return nullptr;
- }
-
- ut_error;
-}
-
-/** Stop watching whether a page has been read in.
-watch_set(id) must have returned nullptr before.
-@param id page identifier
-@param chain unlocked hash table chain */
-TRANSACTIONAL_TARGET
-void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
-{
- mysql_mutex_assert_not_owner(&mutex);
- buf_page_t *w;
- {
- transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
- /* The page must exist because watch_set() did fix(). */
- w= page_hash.get(id, chain);
- ut_ad(w->in_page_hash);
- if (!watch_is_sentinel(*w))
- {
- no_watch:
- w->unfix();
- w= nullptr;
- }
- else
- {
- ut_ad(!w->oldest_modification());
- const auto state= w->state();
- ut_ad(~buf_page_t::LRU_MASK & state);
- ut_ad(state >= buf_page_t::UNFIXED + 1);
- if (state != buf_page_t::UNFIXED + 1)
- goto no_watch;
- }
- }
-
- if (!w)
- return;
-
- const auto old= w;
- /* The following is based on buf_pool_t::watch_remove(). */
- mysql_mutex_lock(&mutex);
- w= page_hash.get(id, chain);
-
- {
- transactional_lock_guard<page_hash_latch> g
- {buf_pool.page_hash.lock_get(chain)};
- auto f= w->unfix();
- ut_ad(f < buf_page_t::READ_FIX || w != old);
-
- if (f == buf_page_t::UNFIXED && w == old)
- {
- page_hash.remove(chain, w);
- // Now that w is detached from page_hash, release it to watch[].
- ut_ad(w->id_ == id);
- ut_ad(!w->frame);
- ut_ad(!w->zip.data);
- w->set_state(buf_page_t::NOT_USED);
- }
- }
-
- mysql_mutex_unlock(&mutex);
-}
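
The removed watch_set()/watch_unset() pair implements a sentinel that could be left in the page hash so that a later check can tell whether the watched page was read into the buffer pool in the meantime. A much-simplified, single-threaded model of that idea over a std::unordered_map; all names and structures here are illustrative, not the buf_pool implementation:

#include <cstdint>
#include <cstdio>
#include <unordered_map>

struct page_slot { bool is_sentinel; };

static std::unordered_map<uint64_t, page_slot> page_hash;

/* Returns true if a real page already existed (no watch is needed);
   otherwise installs a sentinel entry for page_id. */
static bool watch_set(uint64_t page_id)
{
	auto it = page_hash.find(page_id);
	if (it != page_hash.end() && !it->second.is_sentinel) {
		return true;
	}
	page_hash[page_id] = {true};	/* install the sentinel */
	return false;
}

/* Simulates a read completing: the sentinel is replaced by a real page. */
static void page_read_in(uint64_t page_id) { page_hash[page_id] = {false}; }

/* Returns true if the watched page appeared; removes a pure sentinel. */
static bool watch_unset(uint64_t page_id)
{
	auto it = page_hash.find(page_id);
	const bool appeared = it != page_hash.end() && !it->second.is_sentinel;
	if (!appeared && it != page_hash.end()) {
		page_hash.erase(it);
	}
	return appeared;
}

int main()
{
	watch_set(7);
	page_read_in(7);
	std::printf("page 7 appeared: %d\n", watch_unset(7));	/* 1 */
	watch_set(8);
	std::printf("page 8 appeared: %d\n", watch_unset(8));	/* 0 */
	return 0;
}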
-
/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
@@ -2453,8 +2324,6 @@ void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
}
block->page.lock.x_lock();
- if (block->page.is_ibuf_exist())
- ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size());
#ifdef BTR_CUR_HASH_ADAPT
if (block->index)
btr_search_drop_page_hash_index(block, false);
@@ -2494,7 +2363,7 @@ lookup:
if (hash_lock.is_locked())
xabort();
bpage= buf_pool.page_hash.get(page_id, chain);
- if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ if (!bpage)
{
xend();
goto must_read_page;
@@ -2519,7 +2388,7 @@ lookup:
{
hash_lock.lock_shared();
bpage= buf_pool.page_hash.get(page_id, chain);
- if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ if (!bpage)
{
hash_lock.unlock_shared();
goto must_read_page;
@@ -2578,7 +2447,7 @@ lookup:
return bpage;
must_read_page:
- switch (dberr_t err= buf_read_page(page_id, zip_size)) {
+ switch (dberr_t err= buf_read_page(page_id, zip_size, chain)) {
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
mariadb_increment_pages_read();
@@ -2701,29 +2570,32 @@ err_exit:
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+or BUF_PEEK_IF_IN_POOL
@param[in] mtr mini-transaction
@param[out] err DB_SUCCESS or error code
-@param[in] allow_ibuf_merge Allow change buffer merge to happen
-while reading the page from file
-then it makes sure that it does merging of change buffer changes while
-reading the page from file.
@return pointer to the block or NULL */
TRANSACTIONAL_TARGET
buf_block_t*
-buf_page_get_low(
+buf_page_get_gen(
const page_id_t page_id,
ulint zip_size,
ulint rw_latch,
buf_block_t* guess,
ulint mode,
mtr_t* mtr,
- dberr_t* err,
- bool allow_ibuf_merge)
+ dberr_t* err)
{
- unsigned access_time;
ulint retries = 0;
+ /* BUF_GET_RECOVER is only used by recv_sys_t::recover(),
+ which must be invoked during early server startup when crash
+ recovery may be in progress. The only case when it may be
+ invoked outside recovery is when dict_create() has initialized
+ a new database and is invoking dict_boot(). In this case, the
+ LSN will be small. */
+ ut_ad(mode == BUF_GET_RECOVER
+ ? recv_recovery_is_on() || log_sys.get_lsn() < 50000
+ : !recv_recovery_is_on() || recv_sys.after_apply);
ut_ad(!mtr || mtr->is_active());
ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
ut_ad((rw_latch == RW_S_LATCH)
@@ -2738,7 +2610,6 @@ buf_page_get_low(
#ifdef UNIV_DEBUG
switch (mode) {
default:
- ut_ad(!allow_ibuf_merge);
ut_ad(mode == BUF_PEEK_IF_IN_POOL);
break;
case BUF_GET_POSSIBLY_FREED:
@@ -2746,8 +2617,8 @@ buf_page_get_low(
/* The caller may pass a dummy page size,
because it does not really matter. */
break;
+ case BUF_GET_RECOVER:
case BUF_GET:
- case BUF_GET_IF_IN_POOL_OR_WATCH:
ut_ad(!mtr->is_freeing_tree());
fil_space_t* s = fil_space_get(page_id.space());
ut_ad(s);
@@ -2755,9 +2626,6 @@ buf_page_get_low(
}
#endif /* UNIV_DEBUG */
- ut_ad(!mtr || !ibuf_inside(mtr)
- || ibuf_page_low(page_id, zip_size, FALSE, NULL));
-
++buf_pool.stat.n_page_gets;
mariadb_increment_pages_accessed();
@@ -2791,8 +2659,7 @@ loop:
hash_lock.lock_shared();
block = reinterpret_cast<buf_block_t*>(
buf_pool.page_hash.get(page_id, chain));
- if (UNIV_LIKELY(block
- && !buf_pool.watch_is_sentinel(block->page))) {
+ if (UNIV_LIKELY(block != nullptr)) {
state = block->page.fix();
hash_lock.unlock_shared();
goto got_block;
@@ -2804,17 +2671,6 @@ loop:
case BUF_GET_IF_IN_POOL:
case BUF_PEEK_IF_IN_POOL:
return nullptr;
- case BUF_GET_IF_IN_POOL_OR_WATCH:
- /* Buffer-fixing inside watch_set() will prevent eviction */
- block = reinterpret_cast<buf_block_t*>
- (buf_pool.watch_set(page_id, chain));
-
- if (block) {
- state = block->page.state();
- goto got_block_fixed;
- }
-
- return nullptr;
}
/* The call path is buf_read_page() ->
@@ -2828,11 +2684,11 @@ loop:
corrupted, or if an encrypted page with a valid
	checksum cannot be decrypted. */
- switch (dberr_t local_err = buf_read_page(page_id, zip_size)) {
+ switch (dberr_t local_err = buf_read_page(page_id, zip_size, chain)) {
case DB_SUCCESS:
case DB_SUCCESS_LOCKED_REC:
mariadb_increment_pages_read();
- buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr));
+ buf_read_ahead_random(page_id, zip_size);
break;
default:
if (mode != BUF_GET_POSSIBLY_FREED
@@ -2854,7 +2710,6 @@ loop:
got_block:
ut_ad(!block->page.in_zip_hash);
state++;
-got_block_fixed:
ut_ad(state > buf_page_t::FREED);
if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) {
@@ -2943,14 +2798,14 @@ wait_for_unzip:
wait_for_read:
/* The page is being read or written, or
another thread is executing buf_zip_decompress()
- in buf_page_get_low() on it. */
+ in buf_page_get_gen() on it. */
block->page.unfix();
std::this_thread::sleep_for(
std::chrono::microseconds(100));
goto loop;
}
- buf_block_t *new_block = buf_LRU_get_free_block(false);
+ buf_block_t *new_block = buf_LRU_get_free_block(have_no_mutex);
buf_block_init_low(new_block);
wait_for_unfix:
@@ -2966,15 +2821,11 @@ wait_for_unfix:
ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain));
/* Wait for any other threads to release their buffer-fix
- on the compressed-only block descriptor.
- FIXME: Never fix() before acquiring the lock.
- Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free()
- we are violating that principle. */
+ on the compressed-only block descriptor. */
state = block->page.state();
switch (state) {
case buf_page_t::UNFIXED + 1:
- case buf_page_t::IBUF_EXIST + 1:
case buf_page_t::REINIT + 1:
break;
default:
@@ -2996,7 +2847,7 @@ wait_for_unfix:
goto wait_for_unfix;
}
- /* Ensure that another buf_page_get_low() will wait for
+ /* Ensure that another buf_page_get_gen() will wait for
new_block->page.lock.x_unlock(). */
block->page.set_state(buf_page_t::READ_FIX);
@@ -3028,13 +2879,6 @@ wait_for_unfix:
buf_pool.n_pend_unzip++;
- access_time = block->page.is_accessed();
-
- if (!access_time && !recv_no_ibuf_operations
- && ibuf_page_exists(block->page.id(), block->zip_size())) {
- state = buf_page_t::IBUF_EXIST + 1;
- }
-
/* Decompress the page while not holding
buf_pool.mutex. */
const auto ok = buf_zip_decompress(block, false);
@@ -3053,63 +2897,6 @@ wait_for_unfix:
}
}
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-re_evict:
- if (mode != BUF_GET_IF_IN_POOL
- && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
- } else if (!ibuf_debug || recv_recovery_is_on()) {
- } else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
- for (ulint i = 0; i < mtr->get_savepoint(); i++) {
- if (buf_block_t* b = mtr->block_at_savepoint(i)) {
- if (b->page.oldest_modification() > 2
- && b->page.lock.have_any()) {
- /* We are holding a dirty page latch
- that would hang buf_flush_sync(). */
- space->release();
- goto re_evict_fail;
- }
- }
- }
-
- /* Try to evict the block from the buffer pool, to use the
- insert buffer (change buffer) as much as possible. */
-
- mysql_mutex_lock(&buf_pool.mutex);
-
- block->unfix();
-
- /* Blocks cannot be relocated or enter or exit the
- buf_pool while we are holding the buf_pool.mutex. */
- const bool evicted = buf_LRU_free_page(&block->page, true);
- space->release();
-
- if (!evicted) {
- block->fix();
- }
-
- mysql_mutex_unlock(&buf_pool.mutex);
-
- if (evicted) {
- if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
- buf_pool.watch_set(page_id, chain);
- }
- return(NULL);
- }
-
- buf_flush_sync();
-
- state = block->page.state();
-
- if (state == buf_page_t::UNFIXED + 1
- && !block->page.oldest_modification()) {
- goto re_evict;
- }
-
- /* Failed to evict the page; change it directly */
- }
-re_evict_fail:
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
goto ignore_block;
}
@@ -3130,184 +2917,45 @@ re_evict_fail:
state to FREED). Therefore, after acquiring the page latch we
must recheck the state. */
- if (state >= buf_page_t::UNFIXED
- && allow_ibuf_merge
- && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX
- && page_is_leaf(block->page.frame)) {
- block->page.lock.x_lock();
- state = block->page.state();
- ut_ad(state < buf_page_t::READ_FIX);
-
- if (state >= buf_page_t::IBUF_EXIST
- && state < buf_page_t::REINIT) {
- block->page.clear_ibuf_exist();
- if (dberr_t local_err =
- ibuf_merge_or_delete_for_page(block, page_id,
- block->zip_size())) {
- if (err) {
- *err = local_err;
- }
- goto release_and_ignore_block;
- }
- } else if (state < buf_page_t::UNFIXED) {
-release_and_ignore_block:
- block->page.lock.x_unlock();
- goto ignore_block;
- }
-
-#ifdef BTR_CUR_HASH_ADAPT
- btr_search_drop_page_hash_index(block, true);
-#endif /* BTR_CUR_HASH_ADAPT */
-
- switch (rw_latch) {
- case RW_NO_LATCH:
- block->page.lock.x_unlock();
- break;
- case RW_S_LATCH:
- block->page.lock.x_unlock();
- block->page.lock.s_lock();
- break;
- case RW_SX_LATCH:
- block->page.lock.x_u_downgrade();
- break;
- default:
- ut_ad(rw_latch == RW_X_LATCH);
- }
-
- mtr->memo_push(block, mtr_memo_type_t(rw_latch));
- } else {
- switch (rw_latch) {
- case RW_NO_LATCH:
- mtr->memo_push(block, MTR_MEMO_BUF_FIX);
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ mtr->memo_push(block, MTR_MEMO_BUF_FIX);
+ return block;
+ case RW_S_LATCH:
+ block->page.lock.s_lock();
+ break;
+ case RW_SX_LATCH:
+ block->page.lock.u_lock();
+ ut_ad(!block->page.is_io_fixed());
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ if (block->page.lock.x_lock_upgraded()) {
+ ut_ad(block->page.id() == page_id);
+ block->unfix();
+ mtr->page_lock_upgrade(*block);
return block;
- case RW_S_LATCH:
- block->page.lock.s_lock();
- break;
- case RW_SX_LATCH:
- block->page.lock.u_lock();
- ut_ad(!block->page.is_io_fixed());
- break;
- default:
- ut_ad(rw_latch == RW_X_LATCH);
- if (block->page.lock.x_lock_upgraded()) {
- ut_ad(block->page.id() == page_id);
- block->unfix();
- mtr->page_lock_upgrade(*block);
- return block;
- }
}
+ }
- mtr->memo_push(block, mtr_memo_type_t(rw_latch));
- state = block->page.state();
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+ state = block->page.state();
- if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
- mtr->release_last_page();
- goto ignore_unfixed;
- }
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ mtr->release_last_page();
+ goto ignore_unfixed;
+ }
- ut_ad(state < buf_page_t::READ_FIX
- || state > buf_page_t::WRITE_FIX);
+ ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX);
#ifdef BTR_CUR_HASH_ADAPT
- btr_search_drop_page_hash_index(block, true);
+ btr_search_drop_page_hash_index(block, true);
#endif /* BTR_CUR_HASH_ADAPT */
- }
ut_ad(page_id_t(page_get_space_id(block->page.frame),
page_get_page_no(block->page.frame)) == page_id);
- return block;
-}
-
-/** Get access to a database page. Buffered redo log may be applied.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in] guess guessed block or NULL
-@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in,out] mtr mini-transaction, or NULL
-@param[out] err DB_SUCCESS or error code
-@param[in] allow_ibuf_merge Allow change buffer merge while
-reading the pages from file.
-@return pointer to the block or NULL */
-buf_block_t*
-buf_page_get_gen(
- const page_id_t page_id,
- ulint zip_size,
- ulint rw_latch,
- buf_block_t* guess,
- ulint mode,
- mtr_t* mtr,
- dberr_t* err,
- bool allow_ibuf_merge)
-{
- buf_block_t *block= recv_sys.recover(page_id);
- if (UNIV_LIKELY(!block))
- return buf_page_get_low(page_id, zip_size, rw_latch,
- guess, mode, mtr, err, allow_ibuf_merge);
- else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
- {
- corrupted:
- if (err)
- *err= DB_CORRUPTION;
- return nullptr;
- }
- /* Recovery is a special case; we fix() before acquiring lock. */
- auto s= block->page.fix();
- ut_ad(s >= buf_page_t::FREED);
- /* The block may be write-fixed at this point because we are not
- holding a lock, but it must not be read-fixed. */
- ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
- if (err)
- *err= DB_SUCCESS;
- const bool must_merge= allow_ibuf_merge &&
- ibuf_page_exists(page_id, block->zip_size());
- if (s < buf_page_t::UNFIXED)
- {
- got_freed_page:
- ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
- mysql_mutex_lock(&buf_pool.mutex);
- block->page.unfix();
- buf_LRU_free_page(&block->page, true);
- mysql_mutex_unlock(&buf_pool.mutex);
- goto corrupted;
- }
- else if (must_merge &&
- fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX &&
- page_is_leaf(block->page.frame))
- {
- block->page.lock.x_lock();
- s= block->page.state();
- ut_ad(s > buf_page_t::FREED);
- ut_ad(s < buf_page_t::READ_FIX);
- if (s < buf_page_t::UNFIXED)
- {
- block->page.lock.x_unlock();
- goto got_freed_page;
- }
- else
- {
- if (block->page.is_ibuf_exist())
- block->page.clear_ibuf_exist();
- if (dberr_t e=
- ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()))
- {
- if (err)
- *err= e;
- buf_pool.corrupted_evict(&block->page, s);
- return nullptr;
- }
- }
- if (rw_latch == RW_X_LATCH)
- {
- mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
- return block;
- }
- block->page.lock.x_unlock();
- }
- mtr->page_lock(block, rw_latch);
- return block;
+ return block;
}
TRANSACTIONAL_TARGET
@@ -3344,9 +2992,6 @@ buf_block_t *buf_page_optimistic_get(buf_block_t *block,
return nullptr;
}
- ut_ad(!ibuf_inside(mtr) ||
- ibuf_page(block->page.id(), block->zip_size(), nullptr));
-
if (modify_clock != block->modify_clock || block->page.is_freed())
{
block->page.lock.s_unlock();
@@ -3369,8 +3014,6 @@ buf_block_t *buf_page_optimistic_get(buf_block_t *block,
else
{
ut_ad(!block->page.is_io_fixed());
- ut_ad(!ibuf_inside(mtr) ||
- ibuf_page(block->page.id(), block->zip_size(), nullptr));
if (modify_clock != block->modify_clock || block->page.is_freed())
{
@@ -3459,12 +3102,11 @@ retry:
buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain);
- if (bpage && !buf_pool.watch_is_sentinel(*bpage))
+ if (bpage)
{
#ifdef BTR_CUR_HASH_ADAPT
const dict_index_t *drop_hash_entry= nullptr;
#endif
- bool ibuf_exist= false;
if (!mtr->have_x_latch(reinterpret_cast<const buf_block_t&>(*bpage)))
{
@@ -3490,10 +3132,7 @@ retry:
if (state < buf_page_t::UNFIXED)
bpage->set_reinit(buf_page_t::FREED);
else
- {
bpage->set_reinit(state & buf_page_t::LRU_MASK);
- ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
- }
if (UNIV_LIKELY(bpage->frame != nullptr))
{
@@ -3519,10 +3158,7 @@ retry:
if (state < buf_page_t::UNFIXED)
bpage->set_reinit(buf_page_t::FREED);
else
- {
bpage->set_reinit(state & buf_page_t::LRU_MASK);
- ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
- }
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_relocate(bpage, &free_block->page);
@@ -3562,9 +3198,6 @@ retry:
false);
#endif /* BTR_CUR_HASH_ADAPT */
- if (ibuf_exist && !recv_recovery_is_on())
- ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
-
return reinterpret_cast<buf_block_t*>(bpage);
}
@@ -3605,13 +3238,6 @@ retry:
bpage->set_accessed();
- /* Delete possible entries for the page from the insert buffer:
- such can exist if the page belonged to an index which was dropped */
- if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
- !srv_is_undo_tablespace(page_id.space()) &&
- !recv_recovery_is_on())
- ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
-
static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8);
mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
@@ -3675,32 +3301,15 @@ ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read)
const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame;
switch (fil_page_get_type(frame)) {
- ulint level;
case FIL_PAGE_TYPE_INSTANT:
case FIL_PAGE_INDEX:
case FIL_PAGE_RTREE:
- level = btr_page_get_level(frame);
-
- /* Check if it is an index page for insert buffer */
- if (fil_page_get_type(frame) == FIL_PAGE_INDEX
- && btr_page_get_index_id(frame)
- == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
- if (level == 0) {
- counter = MONITOR_RW_COUNTER(
- read, MONITOR_INDEX_IBUF_LEAF_PAGE);
- } else {
- counter = MONITOR_RW_COUNTER(
- read,
- MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
- }
+ if (page_is_leaf(frame)) {
+ counter = MONITOR_RW_COUNTER(
+ read, MONITOR_INDEX_LEAF_PAGE);
} else {
- if (level == 0) {
- counter = MONITOR_RW_COUNTER(
- read, MONITOR_INDEX_LEAF_PAGE);
- } else {
- counter = MONITOR_RW_COUNTER(
- read, MONITOR_INDEX_NON_LEAF_PAGE);
- }
+ counter = MONITOR_RW_COUNTER(
+ read, MONITOR_INDEX_NON_LEAF_PAGE);
}
break;
@@ -3712,14 +3321,6 @@ ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read)
counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE);
break;
- case FIL_PAGE_IBUF_FREE_LIST:
- counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE);
- break;
-
- case FIL_PAGE_IBUF_BITMAP:
- counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE);
- break;
-
case FIL_PAGE_TYPE_SYS:
counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE);
break;
@@ -3932,41 +3533,30 @@ database_corrupted:
<< FORCE_RECOVERY_MSG;
}
- if (!srv_force_recovery)
- goto release_page;
- }
-
- if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
- {
+ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED ||
+ !srv_force_recovery)
+ {
release_page:
- buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
- return err;
+ buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
+ return err;
+ }
}
- const bool recovery= recv_recovery_is_on();
+ const bool recovery= frame && recv_recovery_is_on();
if (recovery && !recv_recover_page(node.space, this))
return DB_PAGE_CORRUPTED;
- const bool ibuf_may_exist= frame && !recv_no_ibuf_operations &&
- (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) &&
- fil_page_get_type(read_frame) == FIL_PAGE_INDEX &&
- page_is_leaf(read_frame);
-
if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
buf_page_monitor(*this, true);
DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
if (!recovery)
{
- ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist
- ? READ_FIX - IBUF_EXIST
- : READ_FIX - UNFIXED);
+ ut_d(auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED);
ut_ad(f >= READ_FIX);
ut_ad(f < WRITE_FIX);
}
- else if (ibuf_may_exist)
- set_ibuf_exist();
lock.x_unlock(true);
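
Illustrative sketch, not part of the patch: the read-completion hunk above now always subtracts READ_FIX - UNFIXED from the combined state word, because the IBUF_EXIST class is gone. The point of the arithmetic is that one 32-bit word carries both the fix class and the buffer-fix count, so a single fetch_sub() ends the read fix while preserving the count. A minimal standalone C++ illustration; the constant values below are invented for the example (the real ones live in buf0types.h):

#include <atomic>
#include <cassert>
#include <cstdint>

int main()
{
  /* Assumed layout for illustration only: fix class in the high bits,
     buffer-fix count in the low bits. */
  constexpr std::uint32_t UNFIXED   = 1U << 29;
  constexpr std::uint32_t READ_FIX  = 2U << 29;
  constexpr std::uint32_t WRITE_FIX = 3U << 29;

  std::atomic<std::uint32_t> state{READ_FIX + 5};          /* read-fixed, 5 buffer-fixes */
  std::uint32_t f = state.fetch_sub(READ_FIX - UNFIXED);   /* complete the read */
  assert(f >= READ_FIX && f < WRITE_FIX);                  /* mirrors the ut_ad() checks above */
  assert(state.load() == UNFIXED + 5);                     /* the fix count is preserved */
  return 0;
}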
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index e2702adc..2bf49608 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -53,6 +53,7 @@ void buf_dblwr_t::init()
active_slot= &slots[0];
mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
pthread_cond_init(&cond, nullptr);
+ block_size= FSP_EXTENT_SIZE;
}
}
@@ -67,7 +68,7 @@ inline void buf_dblwr_t::init(const byte *header)
block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
- const uint32_t buf_size= 2 * block_size();
+ const uint32_t buf_size= 2 * block_size;
for (int i= 0; i < 2; i++)
{
slots[i].write_buf= static_cast<byte*>
@@ -86,7 +87,7 @@ bool buf_dblwr_t::create()
return true;
mtr_t mtr;
- const ulint size= block_size();
+ const ulint size= block_size;
start_again:
mtr.start();
@@ -251,7 +252,7 @@ loads the pages from double write buffer into memory.
dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
{
ut_ad(this == &buf_dblwr);
- const uint32_t size= block_size();
+ const uint32_t size= block_size;
/* We do the file i/o past the buffer pool */
byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
@@ -283,6 +284,7 @@ func_exit:
init(TRX_SYS_DOUBLEWRITE + read_buf);
const bool upgrade_to_innodb_file_per_table=
+ !srv_read_only_mode &&
mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
TRX_SYS_DOUBLEWRITE + read_buf) !=
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
@@ -488,7 +490,6 @@ void buf_dblwr_t::write_completed()
mysql_mutex_lock(&mutex);
ut_ad(is_created());
- ut_ad(srv_use_doublewrite_buf);
ut_ad(batch_running);
slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
ut_ad(flush_slot->reserved);
@@ -574,7 +575,7 @@ static void buf_dblwr_check_block(const buf_page_t *bpage)
bool buf_dblwr_t::flush_buffered_writes(const ulint size)
{
mysql_mutex_assert_owner(&mutex);
- ut_ad(size == block_size());
+ ut_ad(size == block_size);
for (;;)
{
@@ -647,7 +648,6 @@ static void *get_frame(const IORequest &request)
void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
{
ut_ad(this == &buf_dblwr);
- ut_ad(srv_use_doublewrite_buf);
ut_ad(is_created());
ut_ad(!srv_read_only_mode);
ut_ad(!request.bpage);
@@ -670,8 +670,14 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
pages_written+= flush_slot->first_free;
mysql_mutex_unlock(&mutex);
- /* Now flush the doublewrite buffer data to disk */
- fil_system.sys_space->flush<false>();
+ /* Make the doublewrite durable. Note: The doublewrite buffer is
+ always in the first file of the system tablespace. We will not
+ bother about fil_system.unflushed_spaces, which can result in a
+ redundant call during fil_flush_file_spaces() in
+ log_checkpoint(). Writes to the system tablespace should be rare,
+ except when executing DDL or using the non-default settings
+ innodb_file_per_table=OFF or innodb_undo_tablespaces=0. */
+ os_file_flush(request.node->handle);
/* The writes have been flushed to disk now and in recovery we will
find them in the doublewrite buffer blocks. Next, write the data pages. */
@@ -714,17 +720,18 @@ posted, and also when we may have to wait for a page latch!
Otherwise a deadlock of threads can occur. */
void buf_dblwr_t::flush_buffered_writes()
{
- if (!is_created() || !srv_use_doublewrite_buf)
+ mysql_mutex_lock(&mutex);
+
+ if (!in_use() && active_slot->first_free == 0)
{
+ mysql_mutex_unlock(&mutex);
fil_flush_file_spaces();
return;
}
ut_ad(!srv_read_only_mode);
- const ulint size= block_size();
- mysql_mutex_lock(&mutex);
- if (!flush_buffered_writes(size))
+ if (!flush_buffered_writes(block_size))
mysql_mutex_unlock(&mutex);
}
@@ -734,8 +741,6 @@ flush_buffered_writes() will be invoked to make space.
@param size payload size in bytes */
void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
{
- ut_ad(request.is_async());
- ut_ad(request.is_write());
ut_ad(request.bpage);
ut_ad(request.bpage->in_file());
ut_ad(request.node);
@@ -744,7 +749,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
ut_ad(request.node->space->referenced());
ut_ad(!srv_read_only_mode);
- const ulint buf_size= 2 * block_size();
+ const ulint buf_size= 2 * block_size;
mysql_mutex_lock(&mutex);
@@ -773,7 +778,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
ut_ad(active_slot->reserved == active_slot->first_free);
ut_ad(active_slot->reserved < buf_size);
new (active_slot->buf_block_arr + active_slot->first_free++)
- element{request, size};
+ element{request.doublewritten(), size};
active_slot->reserved= active_slot->first_free;
if (active_slot->first_free != buf_size ||
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index cc51f8c6..d0a0f0dc 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -576,7 +576,7 @@ buf_load()
}
}
if (!missing.empty()) {
- dict_check_tablespaces_and_store_max_id(&missing);
+ dict_load_tablespaces(&missing);
}
}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index d364be31..a69877ba 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
Copyright (c) 2013, 2014, Fusion-io
This program is free software; you can redistribute it and/or modify it under
@@ -350,9 +350,9 @@ void buf_page_write_complete(const IORequest &request, bool error)
else
{
bpage->write_complete(persistent, error, state);
- if (state < buf_page_t::WRITE_FIX_REINIT &&
- request.node->space->use_doublewrite())
+ if (request.is_doublewritten())
{
+ ut_ad(state < buf_page_t::WRITE_FIX_REINIT);
ut_ad(persistent);
buf_dblwr.write_completed();
}
@@ -1069,7 +1069,6 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
{
ut_ad(bpage == b);
bpage= nullptr;
- ut_ad(!buf_pool.watch_is_sentinel(*b));
ut_ad(b->oldest_modification() > 1);
flush:
if (b->flush(space))
@@ -1083,7 +1082,6 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
else if (b->oldest_modification() > 1 && b->lock.u_lock_try(true))
{
/* For the buf_pool.watch[] sentinels, oldest_modification() == 0 */
- ut_ad(!buf_pool.watch_is_sentinel(*b));
if (b->oldest_modification() < 2)
b->lock.u_unlock(true);
else
@@ -1197,7 +1195,7 @@ static void buf_flush_discard_page(buf_page_t *bpage)
ut_d(const auto state= bpage->state());
ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED ||
- state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT);
+ state == buf_page_t::REINIT);
bpage->lock.u_unlock(true);
buf_LRU_free_page(bpage, true);
}
@@ -1795,7 +1793,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
resize_log.write(CHECKPOINT_1, {c, get_block_size()});
}
- if (srv_file_flush_method != SRV_O_DSYNC)
+ if (!log_write_through)
ut_a(log.flush());
latch.wr_lock(SRW_LOCK_CALL);
ut_ad(checkpoint_pending);
@@ -1827,7 +1825,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
if (!is_pmem())
{
- if (srv_file_flush_method != SRV_O_DSYNC)
+ if (!log_write_through)
ut_a(resize_log.flush());
IF_WIN(log.close(),);
}
@@ -1969,13 +1967,7 @@ static bool log_checkpoint()
if (recv_recovery_is_on())
recv_sys.apply(true);
- switch (srv_file_flush_method) {
- case SRV_NOSYNC:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- fil_flush_file_spaces();
- }
+ fil_flush_file_spaces();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t end_lsn= log_sys.get_lsn();
@@ -2132,13 +2124,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
MONITOR_FLUSH_SYNC_PAGES, n_flushed);
}
- switch (srv_file_flush_method) {
- case SRV_NOSYNC:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- fil_flush_file_spaces();
- }
+ fil_flush_file_spaces();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t newest_lsn= log_sys.get_lsn();
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 33d01b6b..41e3b4e7 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -381,13 +381,14 @@ we put it to free list to be used.
* scan whole LRU list
* scan LRU list even if buf_pool.try_LRU_scan is not set
-@param have_mutex whether buf_pool.mutex is already being held
-@return the free control block, in state BUF_BLOCK_MEMORY */
-buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+@param get how to allocate the block
+@return the free control block, in state BUF_BLOCK_MEMORY
+@retval nullptr if get==have_no_mutex_soft and memory was not available */
+buf_block_t* buf_LRU_get_free_block(buf_LRU_get get)
{
bool waited= false;
MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
- if (!have_mutex)
+ if (UNIV_LIKELY(get != have_mutex))
mysql_mutex_lock(&buf_pool.mutex);
buf_LRU_check_size_of_non_data_objects();
@@ -414,7 +415,7 @@ got_block:
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
- if (!have_mutex)
+ if (UNIV_LIKELY(get != have_mutex))
mysql_mutex_unlock(&buf_pool.mutex);
block->page.zip.clear();
@@ -436,6 +437,12 @@ got_block:
buf_pool.try_LRU_scan= false;
}
+ if (get == have_no_mutex_soft)
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return nullptr;
+ }
+
waited= true;
while (!(block= buf_LRU_get_free_only()))
@@ -765,7 +772,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
/* We must hold an exclusive hash_lock to prevent
bpage->can_relocate() from changing due to a concurrent
- execution of buf_page_get_low(). */
+ execution of buf_page_get_gen(). */
buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold());
page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
/* We cannot use transactional_lock_guard here,
@@ -810,7 +817,6 @@ relocate_compressed:
ut_d(uint32_t s=) b->fix();
ut_ad(s == buf_page_t::FREED
|| s == buf_page_t::UNFIXED
- || s == buf_page_t::IBUF_EXIST
|| s == buf_page_t::REINIT);
}
break;
@@ -1281,6 +1287,51 @@ bool buf_LRU_scan_and_free_block(ulint limit)
buf_LRU_free_from_common_LRU_list(limit);
}
+void buf_LRU_truncate_temp(uint32_t threshold)
+{
+ /* Set the extent descriptor page state as FREED */
+ for (uint32_t cur_xdes_page= xdes_calc_descriptor_page(
+ 0, fil_system.temp_space->free_limit);
+ cur_xdes_page >= threshold;)
+ {
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* block= buf_page_get_gen(
+ page_id_t(SRV_TMP_SPACE_ID, cur_xdes_page), 0, RW_X_LATCH,
+ nullptr, BUF_PEEK_IF_IN_POOL, &mtr))
+ {
+ uint32_t state= block->page.state();
+ ut_ad(state > buf_page_t::UNFIXED);
+ ut_ad(state < buf_page_t::READ_FIX);
+ block->page.set_freed(state);
+ }
+ cur_xdes_page-= uint32_t(srv_page_size);
+ mtr.commit();
+ }
+
+ const page_id_t limit{SRV_TMP_SPACE_ID, threshold};
+ mysql_mutex_lock(&buf_pool.mutex);
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage;)
+ {
+ buf_page_t* next= UT_LIST_GET_NEXT(LRU, bpage);
+ if (bpage->id() >= limit)
+ {
+ #ifdef UNIV_DEBUG
+ if (bpage->lock.u_lock_try(0))
+ {
+ ut_ad(bpage->state() == buf_page_t::FREED);
+ bpage->lock.u_unlock();
+ }
+ #endif /* UNIV_DEBUG */
+ ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+ buf_LRU_free_page(bpage, true);
+ }
+ bpage= next;
+ }
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
#ifdef UNIV_DEBUG
/** Validate the LRU list. */
void buf_LRU_validate()
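
The buf0lru.cc hunk above replaces the boolean have_mutex flag of buf_LRU_get_free_block() with a three-valued buf_LRU_get mode; have_no_mutex_soft is the only mode that may return nullptr, which the buf_read_acquire() helper in buf0rea.cc below relies on. A rough standalone model of that calling contract, with stand-in types (the enumerator names follow the patch, everything else is invented for illustration):

#include <mutex>

enum buf_LRU_get { have_mutex, have_no_mutex, have_no_mutex_soft };

struct block {};                                /* stand-in for buf_block_t */

static std::mutex pool_mutex;                   /* stand-in for buf_pool.mutex */
static block *try_get_free_only() { return nullptr; }          /* pretend the free list is empty */
static block *evict_until_free() { static block b; return &b; } /* the real code scans the LRU */

static block *get_free_block(buf_LRU_get get)
{
  if (get != have_mutex)
    pool_mutex.lock();                          /* the caller did not hold buf_pool.mutex */
  block *b = try_get_free_only();
  if (!b)
  {
    if (get == have_no_mutex_soft)
    {                                           /* best-effort mode: fail instead of waiting */
      pool_mutex.unlock();
      return nullptr;
    }
    b = evict_until_free();                     /* blocking modes keep trying */
  }
  if (get != have_mutex)
    pool_mutex.unlock();
  return b;
}

int main()
{
  block *b = get_free_block(have_no_mutex_soft);  /* read-ahead style: may be nullptr */
  if (!b)
    b = get_free_block(have_no_mutex);            /* synchronous read: always succeeds here */
  return b ? 0 : 1;
}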
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 76a5e710..70e71845 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,7 +35,7 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0lru.h"
#include "buf0buddy.h"
#include "buf0dblwr.h"
-#include "ibuf0ibuf.h"
+#include "page0zip.h"
#include "log0recv.h"
#include "trx0sys.h"
#include "os0file.h"
@@ -44,126 +44,89 @@ Created 11/5/1995 Heikki Tuuri
#include "log.h"
#include "mariadb_stats.h"
+TRANSACTIONAL_TARGET
+bool buf_pool_t::page_hash_contains(const page_id_t page_id, hash_chain &chain)
+{
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ return page_hash.get(page_id, chain);
+}
+
/** If there are buf_pool.curr_size per the number below pending reads, then
read-ahead is not done: this is to prevent flooding the buffer pool with
i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT 2
-/** Remove the sentinel block for the watch before replacing it with a
-real block. watch_unset() or watch_occurred() will notice
-that the block has been replaced with the real block.
-@param w sentinel
-@param chain locked hash table chain
-@return w->state() */
-inline uint32_t buf_pool_t::watch_remove(buf_page_t *w,
- buf_pool_t::hash_chain &chain)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked());
- ut_ad(w >= &watch[0]);
- ut_ad(w < &watch[array_elements(watch)]);
- ut_ad(!w->in_zip_hash);
- ut_ad(!w->zip.data);
-
- uint32_t s{w->state()};
- w->set_state(buf_page_t::NOT_USED);
- ut_ad(s >= buf_page_t::UNFIXED);
- ut_ad(s < buf_page_t::READ_FIX);
-
- if (~buf_page_t::LRU_MASK & s)
- page_hash.remove(chain, w);
-
- ut_ad(!w->in_page_hash);
- w->id_= page_id_t(~0ULL);
- return s;
-}
-
/** Initialize a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
-(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
-(3) if the space is deleted or being deleted,
+(2) if the tablespace has been or is being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame. The io-handler must take care that the flag is cleared
and the lock released later.
-@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] unzip whether the uncompressed page is
- requested (for ROW_FORMAT=COMPRESSED)
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0,
+ bitwise-ORed with 1 in recovery
+@param chain buf_pool.page_hash cell for page_id
+@param block preallocated buffer block (set to nullptr if consumed)
@return pointer to the block
-@retval NULL in case of an error */
+@retval nullptr in case of an error */
TRANSACTIONAL_TARGET
-static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
- ulint zip_size, bool unzip)
+static buf_page_t *buf_page_init_for_read(const page_id_t page_id,
+ ulint zip_size,
+ buf_pool_t::hash_chain &chain,
+ buf_block_t *&block)
{
- mtr_t mtr;
-
- if (mode == BUF_READ_IBUF_PAGES_ONLY)
- {
- /* It is a read-ahead within an ibuf routine */
- ut_ad(!ibuf_bitmap_page(page_id, zip_size));
- ibuf_mtr_start(&mtr);
-
- if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
- {
- ibuf_mtr_commit(&mtr);
- return nullptr;
- }
- }
- else
- ut_ad(mode == BUF_READ_ANY_PAGE);
-
buf_page_t *bpage= nullptr;
- buf_block_t *block= nullptr;
- if (!zip_size || unzip || recv_recovery_is_on())
+ if (!zip_size || (zip_size & 1))
{
- block= buf_LRU_get_free_block(false);
- block->initialise(page_id, zip_size, buf_page_t::READ_FIX);
+ bpage= &block->page;
+ block->initialise(page_id, zip_size & ~1, buf_page_t::READ_FIX);
/* x_unlock() will be invoked
in buf_page_t::read_complete() by the io-handler thread. */
block->page.lock.x_lock(true);
}
- buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
-
- mysql_mutex_lock(&buf_pool.mutex);
-
- buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
- if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ hash_lock.lock();
+ if (buf_pool.page_hash.get(page_id, chain))
{
+page_exists:
+ hash_lock.unlock();
/* The page is already in the buffer pool. */
- if (block)
+ if (bpage)
{
- block->page.lock.x_unlock(true);
- ut_d(block->page.set_state(buf_page_t::MEMORY));
- buf_LRU_block_free_non_file_page(block);
+ bpage->lock.x_unlock(true);
+ ut_d(mysql_mutex_lock(&buf_pool.mutex));
+ ut_d(bpage->set_state(buf_page_t::MEMORY));
+ ut_d(mysql_mutex_unlock(&buf_pool.mutex));
}
- goto func_exit;
+ return nullptr;
}
- if (UNIV_LIKELY(block != nullptr))
+ if (UNIV_UNLIKELY(mysql_mutex_trylock(&buf_pool.mutex)))
{
- bpage= &block->page;
-
- /* Insert into the hash table of file pages */
- if (hash_page)
- {
- transactional_lock_guard<page_hash_latch> g
- {buf_pool.page_hash.lock_get(chain)};
- bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
- (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
- buf_pool.page_hash.append(chain, &block->page);
- }
- else
+ hash_lock.unlock();
+ mysql_mutex_lock(&buf_pool.mutex);
+ hash_lock.lock();
+ if (buf_pool.page_hash.get(page_id, chain))
{
- transactional_lock_guard<page_hash_latch> g
- {buf_pool.page_hash.lock_get(chain)};
- buf_pool.page_hash.append(chain, &block->page);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto page_exists;
}
+ }
+
+ zip_size&= ~1;
+
+ if (UNIV_LIKELY(bpage != nullptr))
+ {
+ block= nullptr;
+ /* Insert into the hash table of file pages */
+ buf_pool.page_hash.append(chain, bpage);
+ hash_lock.unlock();
/* The block must be put to the LRU list, to the old blocks */
- buf_LRU_add_block(&block->page, true/* to old blocks */);
+ buf_LRU_add_block(bpage, true/* to old blocks */);
if (UNIV_UNLIKELY(zip_size))
{
@@ -171,19 +134,19 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
buf_buddy_alloc(). We must defer this operation until after the
block descriptor has been added to buf_pool.LRU and
buf_pool.page_hash. */
- block->page.zip.data= static_cast<page_zip_t*>
- (buf_buddy_alloc(zip_size));
+ bpage->zip.data= static_cast<page_zip_t*>(buf_buddy_alloc(zip_size));
/* To maintain the invariant
block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
we have to add this block to unzip_LRU
after block->page.zip.data is set. */
- ut_ad(block->page.belongs_to_unzip_LRU());
- buf_unzip_LRU_add_block(block, TRUE);
+ ut_ad(bpage->belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), TRUE);
}
}
else
{
+ hash_lock.unlock();
/* The compressed page must be allocated before the
control block (bpage), in order to avoid the
invocation of buf_buddy_relocate_block() on
@@ -196,9 +159,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
check the page_hash again, as it may have been modified. */
if (UNIV_UNLIKELY(lru))
{
- hash_page= buf_pool.page_hash.get(page_id, chain);
-
- if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+ if (UNIV_LIKELY_NULL(buf_pool.page_hash.get(page_id, chain)))
{
/* The block was added by some other thread. */
buf_buddy_free(data, zip_size);
@@ -218,11 +179,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
{
transactional_lock_guard<page_hash_latch> g
{buf_pool.page_hash.lock_get(chain)};
-
- if (hash_page)
- bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
- (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
-
buf_pool.page_hash.append(chain, bpage);
}
@@ -234,10 +190,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
buf_pool.stat.n_pages_read++;
func_exit:
mysql_mutex_unlock(&buf_pool.mutex);
-
- if (mode == BUF_READ_IBUF_PAGES_ONLY)
- ibuf_mtr_commit(&mtr);
-
ut_ad(!bpage || bpage->in_file());
return bpage;
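
/* Illustrative sketch, not part of the patch: the trylock dance in
   buf_page_init_for_read() above.  The code refuses to block on
   buf_pool.mutex while holding the page_hash latch; when the trylock fails
   it drops the latch, takes the mutex, retakes the latch and rechecks the
   hash for a concurrent insert.  Stand-in types only; compiles on its own. */
#include <mutex>
#include <unordered_set>

static std::mutex pool_mutex;                   /* stand-in for buf_pool.mutex */
static std::mutex hash_latch;                   /* stand-in for the page_hash latch */
static std::unordered_set<unsigned> page_hash;  /* stand-in for the hash chain */

static bool insert_for_read(unsigned page_no)
{
  std::unique_lock<std::mutex> h{hash_latch};
  if (page_hash.count(page_no))
    return false;                               /* page already present */
  std::unique_lock<std::mutex> m{pool_mutex, std::try_to_lock};
  if (!m.owns_lock())
  {
    h.unlock();                                 /* do not wait while holding the latch */
    m.lock();
    h.lock();
    if (page_hash.count(page_no))
      return false;                             /* somebody else won the race */
  }
  page_hash.insert(page_no);                    /* both locks held: safe to insert */
  return true;
}

int main() { return insert_for_read(7) ? 0 : 1; }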
@@ -248,24 +200,25 @@ buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0,
+ bitwise-ORed with 1 in recovery
+@param[in,out] chain buf_pool.page_hash cell for page_id
@param[in,out] space tablespace
+@param[in,out] block preallocated buffer block
@param[in] sync true if synchronous aio is desired
-@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] unzip true=request uncompressed page
@return error code
@retval DB_SUCCESS if the page was read
@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
static
dberr_t
buf_read_page_low(
- fil_space_t* space,
- bool sync,
- ulint mode,
const page_id_t page_id,
ulint zip_size,
- bool unzip)
+ buf_pool_t::hash_chain& chain,
+ fil_space_t* space,
+ buf_block_t*& block,
+ bool sync = false)
{
buf_page_t* bpage;
@@ -274,25 +227,7 @@ buf_read_page_low(
return DB_PAGE_CORRUPTED;
}
- if (sync) {
- } else if (trx_sys_hdr_page(page_id)
- || ibuf_bitmap_page(page_id, zip_size)
- || (!recv_no_ibuf_operations
- && ibuf_page(page_id, zip_size, nullptr))) {
-
- /* Trx sys header is so low in the latching order that we play
- safe and do not leave the i/o-completion to an asynchronous
- i/o-thread. Change buffer pages must always be read with
- synchronous i/o, to make sure they do not get involved in
- thread deadlocks. */
- sync = true;
- }
-
- /* The following call will also check if the tablespace does not exist
- or is being dropped; if we succeed in initing the page in the buffer
- pool for read, then DISCARD cannot proceed until the read has
- completed */
- bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+ bpage = buf_page_init_for_read(page_id, zip_size, chain, block);
if (!bpage) {
space->release();
@@ -310,10 +245,10 @@ buf_read_page_low(
DBUG_LOG("ib_buf",
"read page " << page_id << " zip_size=" << zip_size
- << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+ << (sync ? " sync" : " async"));
- void* dst = zip_size ? bpage->zip.data : bpage->frame;
- const ulint len = zip_size ? zip_size : srv_page_size;
+ void* dst = zip_size > 1 ? bpage->zip.data : bpage->frame;
+ const ulint len = zip_size & ~1 ? zip_size & ~1 : srv_page_size;
auto fio = space->io(IORequest(sync
? IORequest::READ_SYNC
@@ -338,25 +273,35 @@ buf_read_page_low(
return fio.err;
}
+/** Acquire a buffer block. */
+static buf_block_t *buf_read_acquire()
+{
+ return buf_LRU_get_free_block(have_no_mutex_soft);
+}
+
+/** Free a buffer block if needed. */
+static void buf_read_release(buf_block_t *block)
+{
+ if (block)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+}
+
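/* Illustrative sketch, not part of the patch: the ownership convention used
   by the helpers above.  buf_page_init_for_read() takes the preallocated
   block by reference and sets it to nullptr when it is consumed, so the
   read-ahead loops can hand the same pointer to every call and release
   whatever is left over at the end.  Stand-in types only. */
#include <cassert>

struct blk { bool in_use = false; };            /* stand-in for buf_block_t */

static bool consume(blk *&b)                    /* models buf_page_init_for_read() */
{
  if (!b)
    return false;
  b->in_use = true;                             /* the buffer pool now owns the block */
  b = nullptr;                                  /* tell the caller it was consumed */
  return true;
}

static void release(blk *b)                     /* models buf_read_release() */
{
  if (b)
    b->in_use = false;                          /* return an unconsumed block */
}

int main()
{
  blk pre;                                      /* models buf_read_acquire() */
  blk *b = &pre;
  consume(b);
  assert(!b);                                   /* consumed: caller must reacquire */
  release(b);                                   /* safe no-op when b is nullptr */
  return 0;
}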
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
-mechanism is not activated. NOTE 1: the calling thread may own latches on
+mechanism is not activated. NOTE: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
-end up waiting for these latches! NOTE 2: the calling thread must want
-access to the page given: this rule is set to prevent unintended read-aheads
-performed by ibuf routines, a situation which could result in a deadlock if
-the OS does not support asynchronous i/o.
+end up waiting for these latches!
@param[in] page_id page id of a page which the current thread
wants to access
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] ibuf whether we are inside ibuf routine
-@return number of page read requests issued; NOTE that if we read ibuf
-pages, it may happen that the page at the given page number does not
-get read even if we return a positive value! */
+@return number of page read requests issued */
TRANSACTIONAL_TARGET
-ulint
-buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
+ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size)
{
if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
/* Disable the read-ahead for temporary tablespace */
@@ -366,11 +311,6 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
/* No read-ahead to avoid thread deadlocks */
return 0;
- if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
- /* If it is an ibuf bitmap page or trx sys hdr, we do no
- read-ahead, as that could break the ibuf page access order */
- return 0;
-
if (os_aio_pending_reads_approx() >
buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
return 0;
@@ -407,18 +347,33 @@ read_ahead:
goto no_read_ahead;
/* Read all the suitable blocks within the area */
- const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+ buf_block_t *block= nullptr;
+ if (UNIV_LIKELY(!zip_size))
+ {
+ allocate_block:
+ if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
+ goto no_read_ahead;
+ }
+ else if (recv_recovery_is_on())
+ {
+ zip_size|= 1;
+ goto allocate_block;
+ }
for (page_id_t i= low; i < high; ++i)
{
- if (ibuf_bitmap_page(i, zip_size))
- continue;
if (space->is_stopping())
break;
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
space->reacquire();
- if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) ==
- DB_SUCCESS)
+ if (buf_read_page_low(i, zip_size, chain, space, block) == DB_SUCCESS)
+ {
count++;
+ ut_ad(!block);
+ if ((UNIV_LIKELY(!zip_size) || (zip_size & 1)) &&
+ UNIV_UNLIKELY(!(block= buf_read_acquire())))
+ break;
+ }
}
if (count)
@@ -435,6 +390,7 @@ read_ahead:
}
space->release();
+ buf_read_release(block);
return count;
}
@@ -442,15 +398,17 @@ read_ahead:
if it is not already there. Sets the io_fix and an exclusive lock
on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted
+@param page_id page id
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param chain buf_pool.page_hash cell for page_id
+@retval DB_SUCCESS if the page was read and is not corrupted,
@retval DB_SUCCESS_LOCKED_REC if the page was not read
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
-dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size,
+ buf_pool_t::hash_chain &chain)
{
fil_space_t *space= fil_space_t::get(page_id.space());
if (!space)
@@ -460,9 +418,26 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
return DB_TABLESPACE_DELETED;
}
- buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */
- return buf_read_page_low(space, true, BUF_READ_ANY_PAGE,
- page_id, zip_size, false);
+ /* Our caller should already have ensured that the page does not
+ exist in buf_pool.page_hash. */
+ buf_block_t *block= nullptr;
+ if (UNIV_LIKELY(!zip_size))
+ {
+ allocate_block:
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_LRU_stat_inc_io();
+ block= buf_LRU_get_free_block(have_mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ else if (recv_recovery_is_on())
+ {
+ zip_size|= 1;
+ goto allocate_block;
+ }
+
+ dberr_t err= buf_read_page_low(page_id, zip_size, chain, space, block, true);
+ buf_read_release(block);
+ return err;
}
/** High-level function which reads a page asynchronously from a file to the
@@ -475,15 +450,39 @@ released by the i/o-handler thread.
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size)
{
- buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
- page_id, zip_size, false);
-
- /* We do not increment number of I/O operations used for LRU policy
- here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
- about evicting uncompressed version of compressed pages from the
- buffer pool. Since this function is called from buffer pool load
- these IOs are deliberate and are not part of normal workload we can
- ignore these in our heuristics. */
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ if (buf_pool.page_hash_contains(page_id, chain))
+ {
+ skip:
+ space->release();
+ return;
+ }
+
+ buf_block_t *block= nullptr;
+ if (UNIV_LIKELY(!zip_size))
+ {
+ allocate_block:
+ if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
+ goto skip;
+ }
+ else if (recv_recovery_is_on())
+ {
+ zip_size|= 1;
+ goto allocate_block;
+ }
+
+ if (buf_read_page_low(page_id, zip_size, chain, space, block) ==
+ DB_SUCCESS)
+ ut_ad(!block);
+ else
+ buf_read_release(block);
+
+ /* We do not increment number of I/O operations used for LRU policy
+ here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+ about evicting uncompressed version of ROW_FORMAT=COMPRESSED pages
+ from the buffer pool. Since this function is called from buffer pool
+ load these IOs are deliberate and are not part of normal workload we
+ can ignore these in our heuristics. */
}
/** Applies linear read-ahead if in the buf_pool the page is a border page of
@@ -505,16 +504,11 @@ only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
-NOTE 3: the calling thread must want access to the page given: this rule is
-set to prevent unintended read-aheads performed by ibuf routines, a situation
-which could result in a deadlock if the OS does not support asynchronous io.
@param[in] page_id page id; see NOTE 3 above
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] ibuf whether if we are inside ibuf routine
@return number of page read requests issued */
TRANSACTIONAL_TARGET
-ulint
-buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size)
{
/* check if readahead is disabled.
Disable the read ahead logic for temporary tablespace */
@@ -541,11 +535,6 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
/* This is not a border page of the area */
return 0;
- if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
- /* If it is an ibuf bitmap page or trx sys hdr, we do no
- read-ahead, as that could break the ibuf page access order */
- return 0;
-
fil_space_t *space= fil_space_t::get(page_id.space());
if (!space)
return 0;
@@ -575,7 +564,7 @@ fail:
hash_lock.lock_shared();
const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
- if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ if (!bpage)
{
hash_lock.unlock_shared();
if (i == page_id)
@@ -651,18 +640,35 @@ failed:
}
/* If we got this far, read-ahead can be sensible: do it */
+ buf_block_t *block= nullptr;
+ if (UNIV_LIKELY(!zip_size))
+ {
+ allocate_block:
+ if (UNIV_UNLIKELY(!(block= buf_read_acquire())))
+ goto fail;
+ }
+ else if (recv_recovery_is_on())
+ {
+ zip_size|= 1;
+ goto allocate_block;
+ }
+
count= 0;
- for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
- new_low <= new_high_1; ++new_low)
+ for (; new_low <= new_high_1; ++new_low)
{
- if (ibuf_bitmap_page(new_low, zip_size))
- continue;
if (space->is_stopping())
break;
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(new_low.fold());
space->reacquire();
- if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) ==
+ if (buf_read_page_low(new_low, zip_size, chain, space, block) ==
DB_SUCCESS)
+ {
count++;
+ ut_ad(!block);
+ if ((UNIV_LIKELY(!zip_size) || (zip_size & 1)) &&
+ UNIV_UNLIKELY(!(block= buf_read_acquire())))
+ break;
+ }
}
if (count)
@@ -679,6 +685,7 @@ failed:
}
space->release();
+ buf_read_release(block);
return count;
}
@@ -686,31 +693,41 @@ failed:
@param space tablespace
@param page_id page identifier
@param recs log records
-@param init page initialization, or nullptr if the page needs to be read */
+@param init_lsn page initialization, or 0 if the page needs to be read */
void buf_read_recover(fil_space_t *space, const page_id_t page_id,
- page_recv_t &recs, recv_init *init)
+ page_recv_t &recs, lsn_t init_lsn)
{
ut_ad(space->id == page_id.space());
space->reacquire();
- const ulint zip_size= space->zip_size();
+ const ulint zip_size= space->zip_size() | 1;
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ buf_block_t *block= buf_LRU_get_free_block(have_no_mutex);
- if (init)
+ if (init_lsn)
{
- if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id,
- zip_size, true))
+ if (buf_page_t *bpage=
+ buf_page_init_for_read(page_id, zip_size, chain, block))
{
ut_ad(bpage->in_file());
os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs,
UT_LIST_GET_FIRST(space->chain),
- IORequest::READ_ASYNC}, ptrdiff_t(init));
+ IORequest::READ_ASYNC}, init_lsn);
+ return;
}
}
- else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
- page_id, zip_size, true))
+ else if (dberr_t err=
+ buf_read_page_low(page_id, zip_size, chain, space, block))
{
if (err != DB_SUCCESS_LOCKED_REC)
sql_print_error("InnoDB: Recovery failed to read page "
UINT32PF " from %s",
page_id.page_no(), space->chain.start->name);
}
+ else
+ {
+ ut_ad(!block);
+ return;
+ }
+
+ buf_LRU_block_free_non_file_page(block);
}
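
Throughout the buf0rea.cc changes above, the lowest bit of zip_size doubles as an allocate-an-uncompressed-frame flag for recovery (zip_size |= 1 before the call, zip_size & ~1 for the actual I/O size). This works because valid ROW_FORMAT=COMPRESSED page sizes are powers of two of at least 1024 bytes, so bit 0 is always free. A self-contained sketch of the encoding; the helper names are invented for illustration:

constexpr unsigned recovery_flag(unsigned zip_size) { return zip_size | 1; }
constexpr bool     wants_frame(unsigned zip_size)   { return !zip_size || (zip_size & 1); }
constexpr unsigned io_size(unsigned zip_size, unsigned srv_page_size = 16384)
{ return (zip_size & ~1U) ? (zip_size & ~1U) : srv_page_size; }

static_assert(wants_frame(0), "uncompressed pages always need a frame");
static_assert(wants_frame(recovery_flag(8192)), "recovery forces a frame even when compressed");
static_assert(io_size(recovery_flag(8192)) == 8192, "the flag is stripped before the read");
static_assert(io_size(0) == 16384, "no compression: read a full page");

int main() { return 0; }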
diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc
index b1952bcc..dc1c4b9a 100644
--- a/storage/innobase/data/data0type.cc
+++ b/storage/innobase/data/data0type.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,13 +33,6 @@ const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = {
0x80, 0, 0, 0, 0, 0, 0
};
-/* At the database startup we store the default-charset collation number of
-this MySQL installation to this global variable. If we have < 4.1.2 format
-column definitions, or records in the insert buffer, we use this
-charset-collation code for them. */
-
-ulint data_mysql_default_charset_coll;
-
/*********************************************************************//**
Determine how many bytes the first n characters of the given string occupy.
If the string is shorter than n characters, returns the number of bytes
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
index cb60d813..84b8f860 100644
--- a/storage/innobase/dict/dict0boot.cc
+++ b/storage/innobase/dict/dict0boot.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,7 +30,6 @@ Created 4/18/1996 Heikki Tuuri
#include "dict0load.h"
#include "trx0trx.h"
#include "srv0srv.h"
-#include "ibuf0ibuf.h"
#include "buf0flu.h"
#include "log0recv.h"
#include "os0file.h"
@@ -97,18 +96,6 @@ dict_hdr_get_new_id(
mtr.commit();
}
-/** Update dict_sys.row_id in the dictionary header file page. */
-void dict_hdr_flush_row_id(row_id_t id)
-{
- mtr_t mtr;
- mtr.start();
- buf_block_t* d= dict_hdr_get(&mtr);
- byte *row_id= DICT_HDR + DICT_HDR_ROW_ID + d->page.frame;
- if (mach_read_from_8(row_id) < id)
- mtr.write<8>(*d, row_id, id);
- mtr.commit();
-}
-
/** Create the DICT_HDR page on database initialization.
@return error code */
dberr_t dict_create()
@@ -130,10 +117,8 @@ dberr_t dict_create()
}
ut_a(d->page.id() == hdr_page_id);
- /* Start counting row, table, index, and tree ids from
+ /* Start counting table, index, and tree ids from
DICT_HDR_FIRST_ID */
- mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->page.frame,
- DICT_HDR_FIRST_ID);
mtr.write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->page.frame,
DICT_HDR_FIRST_ID);
mtr.write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->page.frame,
@@ -236,12 +221,11 @@ dberr_t dict_boot()
dict_sys.create();
dberr_t err;
- const buf_block_t *d = buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH,
- nullptr, BUF_GET, &mtr, &err);
- if (!d) {
+	const buf_block_t *d = recv_sys.recover(hdr_page_id, &mtr, &err);
+ if (!d) {
mtr.commit();
return err;
- }
+ }
heap = mem_heap_create(450);
@@ -249,17 +233,6 @@ dberr_t dict_boot()
const byte* dict_hdr = &d->page.frame[DICT_HDR];
- /* Because we only write new row ids to disk-based data structure
- (dictionary header) when it is divisible by
- DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
- the latest value of the row id counter. Therefore we advance
- the counter at the database startup to avoid overlapping values.
- Note that when a user after database startup first time asks for
- a new row id, then because the counter is now divisible by
- ..._MARGIN, it will immediately be updated to the disk-based
- header. */
-
- dict_sys.recover_row_id(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID));
if (uint32_t max_space_id
= mach_read_from_4(dict_hdr + DICT_HDR_MAX_SPACE_ID)) {
max_space_id--;
@@ -422,22 +395,6 @@ dberr_t dict_boot()
UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
mtr.commit();
-
- err = ibuf_init_at_db_start();
-
- if (err == DB_SUCCESS || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO) {
- err = DB_SUCCESS;
- /* Load definitions of other indexes on system tables */
-
- dict_load_sys_table(dict_sys.sys_tables);
- dict_load_sys_table(dict_sys.sys_columns);
- dict_load_sys_table(dict_sys.sys_indexes);
- dict_load_sys_table(dict_sys.sys_fields);
- dict_sys.unlock();
- dict_sys.load_sys_tables();
- } else {
- dict_sys.unlock();
- }
-
+ dict_sys.unlock();
return err;
}
diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc
deleted file mode 100644
index bec6da8e..00000000
--- a/storage/innobase/dict/dict0defrag_bg.cc
+++ /dev/null
@@ -1,434 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2016, 2022, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file dict/dict0defrag_bg.cc
-Defragmentation routines.
-
-Created 25/08/2016 Jan Lindström
-*******************************************************/
-
-#include "dict0dict.h"
-#include "dict0stats.h"
-#include "dict0stats_bg.h"
-#include "dict0defrag_bg.h"
-#include "btr0btr.h"
-#include "srv0start.h"
-#include "trx0trx.h"
-#include "lock0lock.h"
-#include "row0mysql.h"
-
-static mysql_mutex_t defrag_pool_mutex;
-
-/** Iterator type for iterating over the elements of objects of type
-defrag_pool_t. */
-typedef defrag_pool_t::iterator defrag_pool_iterator_t;
-
-/** Pool where we store information on which tables are to be processed
-by background defragmentation. */
-defrag_pool_t defrag_pool;
-
-
-/*****************************************************************//**
-Initialize the defrag pool, called once during thread initialization. */
-void
-dict_defrag_pool_init(void)
-/*=======================*/
-{
- ut_ad(!srv_read_only_mode);
- mysql_mutex_init(0, &defrag_pool_mutex, nullptr);
-}
-
-/*****************************************************************//**
-Free the resources occupied by the defrag pool, called once during
-thread de-initialization. */
-void
-dict_defrag_pool_deinit(void)
-/*=========================*/
-{
- ut_ad(!srv_read_only_mode);
-
- mysql_mutex_destroy(&defrag_pool_mutex);
-}
-
-/*****************************************************************//**
-Get an index from the auto defrag pool. The returned index id is removed
-from the pool.
-@return true if the pool was non-empty and "id" was set, false otherwise */
-static
-bool
-dict_stats_defrag_pool_get(
-/*=======================*/
- table_id_t* table_id, /*!< out: table id, or unmodified if
- list is empty */
- index_id_t* index_id) /*!< out: index id, or unmodified if
- list is empty */
-{
- ut_ad(!srv_read_only_mode);
-
- mysql_mutex_lock(&defrag_pool_mutex);
-
- if (defrag_pool.empty()) {
- mysql_mutex_unlock(&defrag_pool_mutex);
- return(false);
- }
-
- defrag_pool_item_t& item = defrag_pool.back();
- *table_id = item.table_id;
- *index_id = item.index_id;
-
- defrag_pool.pop_back();
-
- mysql_mutex_unlock(&defrag_pool_mutex);
-
- return(true);
-}
-
-/*****************************************************************//**
-Add an index in a table to the defrag pool, which is processed by the
-background stats gathering thread. Only the table id and index id are
-added to the list, so the table can be closed after being enqueued and
-it will be opened when needed. If the table or index does not exist later
-(has been DROPped), then it will be removed from the pool and skipped. */
-void
-dict_stats_defrag_pool_add(
-/*=======================*/
-	const dict_index_t*	index)	/*!< in: index to add */
-{
- defrag_pool_item_t item;
-
- ut_ad(!srv_read_only_mode);
-
- mysql_mutex_lock(&defrag_pool_mutex);
-
- /* quit if already in the list */
- for (defrag_pool_iterator_t iter = defrag_pool.begin();
- iter != defrag_pool.end();
- ++iter) {
- if ((*iter).table_id == index->table->id
- && (*iter).index_id == index->id) {
- mysql_mutex_unlock(&defrag_pool_mutex);
- return;
- }
- }
-
- item.table_id = index->table->id;
- item.index_id = index->id;
- defrag_pool.push_back(item);
- if (defrag_pool.size() == 1) {
- /* Kick off dict stats optimizer work */
- dict_stats_schedule_now();
- }
- mysql_mutex_unlock(&defrag_pool_mutex);
-}
-
-/*****************************************************************//**
-Delete a given index from the auto defrag pool. */
-void
-dict_stats_defrag_pool_del(
-/*=======================*/
- const dict_table_t* table, /*!<in: if given, remove
- all entries for the table */
- const dict_index_t* index) /*!< in: if given, remove this index */
-{
- ut_a((table && !index) || (!table && index));
- ut_ad(!srv_read_only_mode);
- ut_ad(dict_sys.frozen());
-
- mysql_mutex_lock(&defrag_pool_mutex);
-
- defrag_pool_iterator_t iter = defrag_pool.begin();
- while (iter != defrag_pool.end()) {
- if ((table && (*iter).table_id == table->id)
- || (index
- && (*iter).table_id == index->table->id
- && (*iter).index_id == index->id)) {
- /* erase() invalidates the iterator */
- iter = defrag_pool.erase(iter);
- if (index)
- break;
- } else {
- iter++;
- }
- }
-
- mysql_mutex_unlock(&defrag_pool_mutex);
-}
-
-/*****************************************************************//**
-Get the first index that has been added for updating persistent defrag
-stats and eventually save its stats. */
-static void dict_stats_process_entry_from_defrag_pool(THD *thd)
-{
- table_id_t table_id;
- index_id_t index_id;
-
- ut_ad(!srv_read_only_mode);
-
- /* pop the first index from the auto defrag pool */
- if (!dict_stats_defrag_pool_get(&table_id, &index_id))
- /* no index in defrag pool */
- return;
-
- /* If the table is no longer cached, we've already lost the in
- memory stats so there's nothing really to write to disk. */
- MDL_ticket *mdl= nullptr;
- if (dict_table_t *table=
- dict_table_open_on_id(table_id, false, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
- thd, &mdl))
- {
- if (dict_index_t *index= !table->corrupted
- ? dict_table_find_index_on_id(table, index_id) : nullptr)
- if (index->is_btree())
- dict_stats_save_defrag_stats(index);
- dict_table_close(table, false, thd, mdl);
- }
-}
-
-/**
-Process every index that has been added for updating persistent defrag
-stats, saving the stats of each one. */
-void dict_defrag_process_entries_from_defrag_pool(THD *thd)
-{
- while (!defrag_pool.empty())
- dict_stats_process_entry_from_defrag_pool(thd);
-}
-
-/*********************************************************************//**
-Save defragmentation result.
-@return DB_SUCCESS or error code */
-dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
-{
- if (index->is_ibuf())
- return DB_SUCCESS;
-
- MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
- dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
- DICT_ERR_IGNORE_NONE);
- if (table_stats)
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table);
- dict_sys.unfreeze();
- }
- if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME))
- {
-release_and_exit:
- if (table_stats)
- dict_table_close(table_stats, false, thd, mdl_table);
- return DB_STATS_DO_NOT_EXIST;
- }
-
- dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
- DICT_ERR_IGNORE_NONE);
- if (index_stats)
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index);
- dict_sys.unfreeze();
- }
- if (!index_stats)
- goto release_and_exit;
- if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME))
- {
- dict_table_close(index_stats, false, thd, mdl_index);
- goto release_and_exit;
- }
-
- trx_t *trx= trx_create();
- trx->mysql_thd= thd;
- trx_start_internal(trx);
- dberr_t ret= trx->read_only
- ? DB_READ_ONLY
- : lock_table_for_trx(table_stats, trx, LOCK_X);
- if (ret == DB_SUCCESS)
- ret= lock_table_for_trx(index_stats, trx, LOCK_X);
- row_mysql_lock_data_dictionary(trx);
- if (ret == DB_SUCCESS)
- ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed",
- index->stat_defrag_n_pages_freed,
- nullptr,
- "Number of pages freed during"
- " last defragmentation run.",
- trx);
- if (ret == DB_SUCCESS)
- trx->commit();
- else
- trx->rollback();
-
- if (table_stats)
- dict_table_close(table_stats, true, thd, mdl_table);
- if (index_stats)
- dict_table_close(index_stats, true, thd, mdl_index);
-
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
-
- return ret;
-}
-
-/**************************************************************//**
-Gets the number of reserved and used pages in a B-tree.
-@return number of pages reserved, or ULINT_UNDEFINED if the index
-is unavailable */
-static
-ulint
-btr_get_size_and_reserved(
- dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- ulint* used, /*!< out: number of pages used (<= reserved) */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
-{
- ulint dummy;
-
- ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
- ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
-
- if (index->page == FIL_NULL
- || dict_index_is_online_ddl(index)
- || !index->is_committed()
- || !index->table->space) {
- return(ULINT_UNDEFINED);
- }
-
- dberr_t err;
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err);
- *used = 0;
- if (!root) {
- return ULINT_UNDEFINED;
- }
-
- mtr->x_lock_space(index->table->space);
-
- ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->page.frame, used, mtr);
- if (flag == BTR_TOTAL_SIZE) {
- n += fseg_n_reserved_pages(*root,
- PAGE_HEADER + PAGE_BTR_SEG_TOP
- + root->page.frame, &dummy, mtr);
- *used += dummy;
- }
-
- return(n);
-}
-
-/*********************************************************************//**
-Save defragmentation stats for a given index.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_save_defrag_stats(
-/*============================*/
- dict_index_t* index) /*!< in: index */
-{
- if (index->is_ibuf())
- return DB_SUCCESS;
- if (!index->is_readable())
- return dict_stats_report_error(index->table, true);
-
- const time_t now= time(nullptr);
- mtr_t mtr;
- ulint n_leaf_pages;
- mtr.start();
- mtr_sx_lock_index(index, &mtr);
- ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
- &n_leaf_pages, &mtr);
- mtr.commit();
-
- if (n_leaf_reserved == ULINT_UNDEFINED)
- return DB_SUCCESS;
-
- THD *thd= current_thd;
- MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
- dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
- DICT_ERR_IGNORE_NONE);
- if (table_stats)
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table);
- dict_sys.unfreeze();
- }
- if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME))
- {
-release_and_exit:
- if (table_stats)
- dict_table_close(table_stats, false, thd, mdl_table);
- return DB_STATS_DO_NOT_EXIST;
- }
-
- dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
- DICT_ERR_IGNORE_NONE);
- if (index_stats)
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index);
- dict_sys.unfreeze();
- }
- if (!index_stats)
- goto release_and_exit;
-
- if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME))
- {
- dict_table_close(index_stats, false, thd, mdl_index);
- goto release_and_exit;
- }
-
- trx_t *trx= trx_create();
- trx->mysql_thd= thd;
- trx_start_internal(trx);
- dberr_t ret= trx->read_only
- ? DB_READ_ONLY
- : lock_table_for_trx(table_stats, trx, LOCK_X);
- if (ret == DB_SUCCESS)
- ret= lock_table_for_trx(index_stats, trx, LOCK_X);
-
- row_mysql_lock_data_dictionary(trx);
-
- if (ret == DB_SUCCESS)
- ret= dict_stats_save_index_stat(index, now, "n_page_split",
- index->stat_defrag_n_page_split, nullptr,
- "Number of new page splits on leaves"
- " since last defragmentation.", trx);
-
- if (ret == DB_SUCCESS)
- ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_defrag",
- n_leaf_pages, nullptr,
- "Number of leaf pages when"
- " this stat is saved to disk", trx);
-
- if (ret == DB_SUCCESS)
- ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_reserved",
- n_leaf_reserved, nullptr,
- "Number of pages reserved for"
- " this index leaves"
- " when this stat is saved to disk", trx);
-
- if (ret == DB_SUCCESS)
- trx->commit();
- else
- trx->rollback();
-
- if (table_stats)
- dict_table_close(table_stats, true, thd, mdl_table);
- if (index_stats)
- dict_table_close(index_stats, true, thd, mdl_index);
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
-
- return ret;
-}
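
The deleted dict0defrag_bg.cc above maintained a mutex-protected pool of (table id, index id) pairs: inserts skipped duplicates, consumers popped from the back, and dropping a table purged its entries. A standalone sketch of that pattern, with illustrative names only (this is not the removed code):

#include <algorithm>
#include <cstdint>
#include <mutex>
#include <vector>

// Illustrative work pool: duplicate-free enqueue, LIFO dequeue,
// everything guarded by a single mutex (as defrag_pool_mutex was).
struct work_item { uint64_t table_id; uint64_t index_id; };

class work_pool
{
  std::mutex m;
  std::vector<work_item> items;
public:
  void add(const work_item& it)
  {
    std::lock_guard<std::mutex> g(m);
    // quit if the pair is already in the list
    for (const work_item& w : items)
      if (w.table_id == it.table_id && w.index_id == it.index_id)
        return;
    items.push_back(it);
    // the real code woke the background thread when size() became 1
  }

  bool get(work_item* out)
  {
    std::lock_guard<std::mutex> g(m);
    if (items.empty())
      return false;
    *out = items.back();  // consume from the back, like defrag_pool
    items.pop_back();
    return true;
  }

  // drop every entry that belongs to a dropped table
  void remove_table(uint64_t table_id)
  {
    std::lock_guard<std::mutex> g(m);
    items.erase(std::remove_if(items.begin(), items.end(),
                               [&](const work_item& w)
                               { return w.table_id == table_id; }),
                items.end());
  }
};

int main()
{
  work_pool pool;
  pool.add({10, 42});
  pool.add({10, 42});  // ignored: duplicate
  work_item it;
  return pool.get(&it) ? 0 : 1;
}

Keeping only ids in the pool means, as the original comment noted, that a table can be closed right after it is enqueued and reopened when its entry is eventually processed.
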
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index a1295c33..e33f86e9 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -1157,6 +1157,7 @@ inline void dict_sys_t::add(dict_table_t* table)
ulint fold = my_crc32c(0, table->name.m_name,
strlen(table->name.m_name));
+ table->row_id = 0;
table->autoinc_mutex.init();
table->lock_mutex_init();
@@ -1976,7 +1977,6 @@ dict_index_add_to_cache(
ut_ad(index->n_def == index->n_fields);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
ut_ad(!dict_index_is_online_ddl(index));
- ut_ad(!dict_index_is_ibuf(index));
ut_d(mem_heap_validate(index->heap));
ut_a(!dict_index_is_clust(index)
@@ -2062,13 +2062,6 @@ dict_index_add_to_cache(
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
- new_index->stat_defrag_n_pages_freed = 0;
- new_index->stat_defrag_n_page_split = 0;
-
- new_index->stat_defrag_sample_next_slot = 0;
- memset(&new_index->stat_defrag_data_size_sample,
- 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
-
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(new_index->table->indexes, new_index);
@@ -2357,15 +2350,7 @@ dict_index_copy_types(
ulint n_fields) /*!< in: number of
field types to copy */
{
- ulint i;
-
- if (dict_index_is_ibuf(index)) {
- dtuple_set_types_binary(tuple, n_fields);
-
- return;
- }
-
- for (i = 0; i < n_fields; i++) {
+ for (ulint i = 0; i < n_fields; i++) {
const dict_field_t* ifield;
dtype_t* dfield_type;
@@ -2604,17 +2589,14 @@ dict_index_build_internal_non_clust(
ulint i;
ibool* indexed;
- ut_ad(table && index);
- ut_ad(!dict_index_is_clust(index));
- ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->is_primary());
ut_ad(dict_sys.locked());
/* The clustered index should be the first in the list of indexes */
clust_index = UT_LIST_GET_FIRST(table->indexes);
ut_ad(clust_index);
- ut_ad(dict_index_is_clust(clust_index));
- ut_ad(!dict_index_is_ibuf(clust_index));
+ ut_ad(clust_index->is_clust());
/* Create a new index */
new_index = dict_mem_index_create(
@@ -3749,24 +3731,7 @@ dict_index_build_node_ptr(
dtuple_t* tuple;
dfield_t* field;
byte* buf;
- ulint n_unique;
-
- if (dict_index_is_ibuf(index)) {
- /* In a universal index tree, we take the whole record as
- the node pointer if the record is on the leaf level,
- on non-leaf levels we remove the last field, which
- contains the page number of the child page */
-
- ut_a(!dict_table_is_comp(index->table));
- n_unique = rec_get_n_fields_old(rec);
-
- if (level > 0) {
- ut_a(n_unique > 1);
- n_unique--;
- }
- } else {
- n_unique = dict_index_get_n_unique_in_tree_nonleaf(index);
- }
+ ulint n_unique = dict_index_get_n_unique_in_tree_nonleaf(index);
tuple = dtuple_create(heap, n_unique + 1);
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
index e7735586..645e0c79 100644
--- a/storage/innobase/dict/dict0load.cc
+++ b/storage/innobase/dict/dict0load.cc
@@ -887,20 +887,19 @@ static uint32_t dict_find_max_space_id(btr_pcur_t *pcur, mtr_t *mtr)
/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system.
Open each data file if an encryption plugin has been loaded.
-@param spaces set of tablespace files to open */
-void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces)
+@param spaces set of tablespace files to open
+@param upgrade whether we need to invoke ibuf_upgrade() */
+void dict_load_tablespaces(const std::set<uint32_t> *spaces, bool upgrade)
{
uint32_t max_space_id = 0;
btr_pcur_t pcur;
mtr_t mtr;
- DBUG_ENTER("dict_check_tablespaces_and_store_max_id");
-
mtr.start();
dict_sys.lock(SRW_LOCK_CALL);
- if (!spaces && ibuf.empty
+ if (!spaces && !upgrade
&& !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
max_space_id = dict_find_max_space_id(&pcur, &mtr);
goto done;
@@ -1000,8 +999,6 @@ done:
fil_set_max_space_id_if_bigger(max_space_id);
dict_sys.unlock();
-
- DBUG_VOID_RETURN;
}
/** Error message for a delete-marked record in dict_load_column_low() */
@@ -1149,7 +1146,7 @@ err_len:
prtype = dtype_form_prtype(
prtype,
- data_mysql_default_charset_coll);
+ default_charset_info->number);
}
}
@@ -2493,9 +2490,7 @@ corrupted:
goto corrupted;
}
- if (table->supports_instant()) {
- err = btr_cur_instant_init(table);
- }
+ err = btr_cur_instant_init(table);
}
} else {
ut_ad(ignore_err & DICT_ERR_IGNORE_INDEX);
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index f647278d..ca0e6885 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -602,8 +602,6 @@ dict_stats_table_clone_create(
continue;
}
- ut_ad(!dict_index_is_ibuf(index));
-
ulint n_uniq = dict_index_get_n_unique(index);
heap_size += sizeof(dict_index_t);
@@ -652,8 +650,6 @@ dict_stats_table_clone_create(
continue;
}
- ut_ad(!dict_index_is_ibuf(index));
-
dict_index_t* idx;
idx = (dict_index_t*) mem_heap_zalloc(heap, sizeof(*idx));
@@ -697,9 +693,6 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
-
- idx->stat_defrag_n_page_split = 0;
- idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@@ -723,16 +716,9 @@ dict_stats_table_clone_free(
/*********************************************************************//**
Write all zeros (or 1 where it makes sense) into an index
statistics members. The resulting stats correspond to an empty index. */
-static
-void
-dict_stats_empty_index(
-/*===================*/
- dict_index_t* index, /*!< in/out: index */
- bool empty_defrag_stats)
- /*!< in: whether to empty defrag stats */
+static void dict_stats_empty_index(dict_index_t *index)
{
ut_ad(!(index->type & DICT_FTS));
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(index->table->stats_mutex_is_owner());
ulint n_uniq = index->n_uniq;
@@ -745,16 +731,12 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
-
- if (empty_defrag_stats) {
- dict_stats_empty_defrag_stats(index);
- dict_stats_empty_defrag_summary(index);
- }
}
-void dict_stats_empty_table(
- dict_table_t* table,
- bool empty_defrag_stats)
+/** Write all zeros (or 1 where it makes sense) into a table and its indexes'
+statistics members. The resulting stats correspond to an empty table.
+@param table table statistics to be emptied */
+void dict_stats_empty_table(dict_table_t *table)
{
/* Initialize table/index level stats is now protected by
table level lock_mutex.*/
@@ -778,9 +760,7 @@ void dict_stats_empty_table(
continue;
}
- ut_ad(!dict_index_is_ibuf(index));
-
- dict_stats_empty_index(index, empty_defrag_stats);
+ dict_stats_empty_index(index);
}
table->stat_initialized = TRUE;
@@ -906,14 +886,12 @@ dict_stats_copy(
if (dst_idx->type & DICT_FTS) {
continue;
}
- dict_stats_empty_index(dst_idx, true);
+ dict_stats_empty_index(dst_idx);
} else {
continue;
}
}
- ut_ad(!dict_index_is_ibuf(dst_idx));
-
if (!INDEX_EQ(src_idx, dst_idx)) {
for (src_idx = dict_table_get_first_index(src);
src_idx != NULL;
@@ -926,7 +904,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
- dict_stats_empty_index(dst_idx, true);
+ dict_stats_empty_index(dst_idx);
continue;
}
@@ -937,7 +915,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
- dict_stats_empty_index(dst_idx, true);
+ dict_stats_empty_index(dst_idx);
} else {
n_copy_el = dst_idx->n_uniq;
}
@@ -957,13 +935,6 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
-
- dst_idx->stat_defrag_modified_counter =
- src_idx->stat_defrag_modified_counter;
- dst_idx->stat_defrag_n_pages_freed =
- src_idx->stat_defrag_n_pages_freed;
- dst_idx->stat_defrag_n_page_split =
- src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@@ -986,9 +957,6 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
-dict_index_t::stat_defrag_modified_counter
-dict_index_t::stat_defrag_n_pages_freed
-dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@param[in] table table whose stats to copy
@@ -1105,11 +1073,10 @@ btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
dberr_t err;
auto offset= index()->page;
- bool merge= false;
ulint height= ULINT_UNDEFINED;
while (buf_block_t *block=
- btr_block_get(*index(), offset, RW_S_LATCH, merge, &mtr, &err))
+ btr_block_get(*index(), offset, RW_S_LATCH, &mtr, &err))
{
page_cur.block= block;
@@ -1131,8 +1098,7 @@ btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
return DB_SUCCESS;
}
- if (!--height)
- merge= !index()->is_clust();
+ height--;
page_cur_open_on_rnd_user_rec(&page_cur);
@@ -1448,8 +1414,6 @@ Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
before the Persistent Statistics feature was introduced.
-This function doesn't update the defragmentation related stats.
-Only persistent statistics supports defragmentation stats.
@return error code
@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */
static
@@ -1470,13 +1434,9 @@ dict_stats_update_transient_for_index(
various means, also via secondary indexes. */
dummy_empty:
index->table->stats_mutex_lock();
- dict_stats_empty_index(index, false);
+ dict_stats_empty_index(index);
index->table->stats_mutex_unlock();
return err;
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
- } else if (ibuf_debug && !dict_index_is_clust(index)) {
- goto dummy_empty;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else if (dict_index_is_online_ddl(index) || !index->is_committed()
|| !index->table->space) {
goto dummy_empty;
@@ -1571,7 +1531,7 @@ dict_stats_update_transient(
if (!table->space) {
/* Nothing to do. */
empty_table:
- dict_stats_empty_table(table, true);
+ dict_stats_empty_table(table);
return err;
} else if (index == NULL) {
/* Table definition is corrupt */
@@ -1582,9 +1542,6 @@ empty_table:
}
for (; index != NULL; index = dict_table_get_next_index(index)) {
-
- ut_ad(!dict_index_is_ibuf(index));
-
if (!index->is_btree()) {
continue;
}
@@ -1593,7 +1550,7 @@ empty_table:
|| !index->is_readable()
|| err == DB_SUCCESS_LOCKED_REC) {
index->table->stats_mutex_lock();
- dict_stats_empty_index(index, false);
+ dict_stats_empty_index(index);
index->table->stats_mutex_unlock();
continue;
}
@@ -1649,9 +1606,7 @@ static dberr_t page_cur_open_level(page_cur_t *page_cur, ulint level,
for (ulint height = ULINT_UNDEFINED;; height--)
{
- buf_block_t* block=
- btr_block_get(*index, page, RW_S_LATCH,
- !height && !index->is_clust(), mtr, &err);
+ buf_block_t* block= btr_block_get(*index, page, RW_S_LATCH, mtr, &err);
if (!block)
break;
@@ -2269,9 +2224,7 @@ dict_stats_analyze_index_below_cur(
block = buf_page_get_gen(page_id, zip_size,
RW_S_LATCH, NULL, BUF_GET,
- &mtr, &err,
- !index->is_clust()
- && 1 == btr_page_get_level(page));
+ &mtr, &err);
if (!block) {
goto func_exit;
}
@@ -3005,20 +2958,19 @@ dict_stats_update_persistent(
|| (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) {
/* Table definition is corrupt */
- dict_stats_empty_table(table, true);
+ dict_stats_empty_table(table);
return(DB_CORRUPTION);
}
- ut_ad(!dict_index_is_ibuf(index));
table->stats_mutex_lock();
- dict_stats_empty_index(index, false);
+ dict_stats_empty_index(index);
table->stats_mutex_unlock();
index_stats_t stats = dict_stats_analyze_index(index);
if (stats.is_bulk_operation()) {
- dict_stats_empty_table(table, false);
+ dict_stats_empty_table(table);
return DB_SUCCESS_LOCKED_REC;
}
@@ -3049,7 +3001,7 @@ dict_stats_update_persistent(
continue;
}
- dict_stats_empty_index(index, false);
+ dict_stats_empty_index(index);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -3061,7 +3013,7 @@ dict_stats_update_persistent(
if (stats.is_bulk_operation()) {
table->stats_mutex_unlock();
- dict_stats_empty_table(table, false);
+ dict_stats_empty_table(table);
return DB_SUCCESS_LOCKED_REC;
}
@@ -3189,25 +3141,21 @@ dict_stats_save_index_stat(
/** Report an error if updating table statistics failed because
.ibd file is missing, table decryption failed or table is corrupted.
@param[in,out] table Table
-@param[in] defragment true if statistics is for defragment
@retval DB_DECRYPTION_FAILED if decryption of the table failed
@retval DB_TABLESPACE_DELETED if .ibd file is missing
@retval DB_CORRUPTION if table is marked as corrupted */
-dberr_t
-dict_stats_report_error(dict_table_t* table, bool defragment)
+static dberr_t dict_stats_report_error(dict_table_t* table)
{
dberr_t err;
- const char* df = defragment ? " defragment" : "";
-
if (!table->space) {
- ib::warn() << "Cannot save" << df << " statistics for table "
+ ib::warn() << "Cannot save statistics for table "
<< table->name
<< " because the .ibd file is missing. "
<< TROUBLESHOOTING_MSG;
err = DB_TABLESPACE_DELETED;
} else {
- ib::warn() << "Cannot save" << df << " statistics for table "
+ ib::warn() << "Cannot save statistics for table "
<< table->name
<< " because file "
<< table->space->chain.start->name
@@ -3217,7 +3165,7 @@ dict_stats_report_error(dict_table_t* table, bool defragment)
err = table->corrupted ? DB_CORRUPTION : DB_DECRYPTION_FAILED;
}
- dict_stats_empty_table(table, defragment);
+ dict_stats_empty_table(table);
return err;
}
@@ -3400,8 +3348,6 @@ unlocked_free_and_exit:
continue;
}
- ut_ad(!dict_index_is_ibuf(index));
-
for (unsigned i = 0; i < index->n_uniq; i++) {
char stat_name[16];
@@ -3713,16 +3659,6 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
- } else if (stat_name_len == 12 /* strlen("n_page_split") */
- && strncasecmp("n_page_split", stat_name, stat_name_len)
- == 0) {
- index->stat_defrag_n_page_split = (ulint) stat_value;
- arg->stats_were_modified = true;
- } else if (stat_name_len == 13 /* strlen("n_pages_freed") */
- && strncasecmp("n_pages_freed", stat_name, stat_name_len)
- == 0) {
- index->stat_defrag_n_pages_freed = (ulint) stat_value;
- arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@@ -3830,7 +3766,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
- dict_stats_empty_table(table, true);
+ dict_stats_empty_table(table);
THD* thd = current_thd;
MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
@@ -3968,22 +3904,6 @@ release_and_exit:
}
/*********************************************************************//**
-Clear defragmentation stats modified counter for all indices in table. */
-static
-void
-dict_stats_empty_defrag_modified_counter(
- dict_table_t* table) /*!< in: table */
-{
- dict_index_t* index;
- ut_a(table);
- for (index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
- index->stat_defrag_modified_counter = 0;
- }
-}
-
-/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
void
dict_stats_update_for_index(
@@ -4059,13 +3979,13 @@ dict_stats_update(
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
- dict_stats_empty_table(table, false);
+ dict_stats_empty_table(table);
return(DB_SUCCESS);
}
if (trx_id_t bulk_trx_id = table->bulk_trx_id) {
if (trx_sys.find(nullptr, bulk_trx_id, false)) {
- dict_stats_empty_table(table, false);
+ dict_stats_empty_table(table);
return DB_SUCCESS_LOCKED_REC;
}
}
@@ -4127,8 +4047,7 @@ dict_stats_update(
goto transient;
case DICT_STATS_EMPTY_TABLE:
-
- dict_stats_empty_table(table, true);
+ dict_stats_empty_table(table);
/* If table is using persistent stats,
then save the stats on disk */
@@ -4189,7 +4108,6 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
- dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index b0c34dc6..700380d0 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -27,7 +27,6 @@ Created Apr 25, 2012 Vasil Dimov
#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
-#include "dict0defrag_bg.h"
#include "row0mysql.h"
#include "srv0start.h"
#include "fil0fil.h"
@@ -79,7 +78,6 @@ static void dict_stats_recalc_pool_deinit()
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
- defrag_pool.clear();
/*
recalc_pool may still have its buffer allocated. It will free it when
its destructor is called.
@@ -89,9 +87,7 @@ static void dict_stats_recalc_pool_deinit()
to empty_pool object, which will free it when leaving this function:
*/
recalc_pool_t recalc_empty_pool;
- defrag_pool_t defrag_empty_pool;
recalc_pool.swap(recalc_empty_pool);
- defrag_pool.swap(defrag_empty_pool);
if (dict_stats_thd)
destroy_background_thd(dict_stats_thd);
@@ -260,7 +256,6 @@ void dict_stats_init()
ut_ad(!srv_read_only_mode);
mysql_mutex_init(recalc_pool_mutex_key, &recalc_pool_mutex, nullptr);
pthread_cond_init(&recalc_pool_cond, nullptr);
- dict_defrag_pool_init();
stats_initialised= true;
}
@@ -277,7 +272,6 @@ void dict_stats_deinit()
stats_initialised = false;
dict_stats_recalc_pool_deinit();
- dict_defrag_pool_deinit();
mysql_mutex_destroy(&recalc_pool_mutex);
pthread_cond_destroy(&recalc_pool_cond);
@@ -391,7 +385,6 @@ static void dict_stats_func(void*)
set_current_thd(dict_stats_thd);
while (dict_stats_process_entry_from_recalc_pool(dict_stats_thd)) {}
- dict_defrag_process_entries_from_defrag_pool(dict_stats_thd);
innobase_reset_background_thd(dict_stats_thd);
set_current_thd(nullptr);
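
The dict0stats_bg.cc hunk above keeps the swap-with-an-empty-container idiom that the surrounding comments describe: clear() leaves the pool's buffer allocated, so the pool is swapped into a local empty object whose destructor releases the memory when the function returns. A minimal demonstration with a plain std::vector standing in for the pool type (illustration only):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<uint64_t> pool(1'000'000, 0);   // stands in for recalc_pool
  pool.clear();                               // size 0, capacity typically kept
  std::cout << "after clear: " << pool.capacity() << '\n';

  {
    std::vector<uint64_t> empty;
    pool.swap(empty);   // empty now owns the big buffer...
  }                     // ...and releases it here, when it goes out of scope
  std::cout << "after swap: " << pool.capacity() << '\n';  // 0
}
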
diff --git a/storage/innobase/dict/drop.cc b/storage/innobase/dict/drop.cc
index dce71974..fcda4ad0 100644
--- a/storage/innobase/dict/drop.cc
+++ b/storage/innobase/dict/drop.cc
@@ -66,8 +66,6 @@ before transaction commit and must be rolled back explicitly are as follows:
#include "dict0stats.h"
#include "dict0stats_bg.h"
-#include "dict0defrag_bg.h"
-#include "btr0defragment.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
@@ -240,8 +238,6 @@ void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
flush_log_later= false;
if (dict_operation)
{
- std::vector<uint32_t> space_ids;
- space_ids.reserve(mod_tables.size());
ut_ad(dict_sys.locked());
lock_sys.wr_lock(SRW_LOCK_CALL);
mutex_lock();
@@ -268,15 +264,11 @@ void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
{
dict_table_t *table= p.first;
dict_stats_recalc_pool_del(table->id, true);
- dict_stats_defrag_pool_del(table, nullptr);
- if (btr_defragment_active)
- btr_defragment_remove_table(table);
const fil_space_t *space= table->space;
ut_ad(!p.second.is_aux_table() || purge_sys.must_wait_FTS());
dict_sys.remove(table);
if (const auto id= space ? space->id : 0)
{
- space_ids.emplace_back(id);
pfs_os_file_t d= fil_delete_tablespace(id);
if (d != OS_FILE_CLOSED)
deleted.emplace_back(d);
@@ -289,9 +281,6 @@ void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
mysql_mutex_lock(&lock_sys.wait_mutex);
lock_sys.deadlock_check();
mysql_mutex_unlock(&lock_sys.wait_mutex);
-
- for (const auto id : space_ids)
- ibuf_delete_for_discarded_space(id);
}
commit_cleanup();
}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index d4e6c6f3..0d67e1a9 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -445,11 +445,11 @@ static byte* fil_encrypt_buf_for_non_full_checksum(
uint srclen = size - unencrypted_bytes;
const byte* src = src_frame + header_len;
byte* dst = dst_frame + header_len;
- uint32 dstlen = 0;
if (page_compressed) {
srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
}
+ uint dstlen = srclen;
int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
crypt_data, key_version,
@@ -516,7 +516,7 @@ static byte* fil_encrypt_buf_for_full_crc32(
+ FIL_PAGE_FCRC32_CHECKSUM);
const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
- uint dstlen = 0;
+ uint dstlen = srclen;
ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
@@ -647,7 +647,6 @@ static dberr_t fil_space_decrypt_full_crc32(
/* Calculate the offset where decryption starts */
const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
- uint dstlen = 0;
bool corrupted = false;
uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted);
if (UNIV_UNLIKELY(corrupted)) {
@@ -656,6 +655,7 @@ static dberr_t fil_space_decrypt_full_crc32(
uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ FIL_PAGE_FCRC32_CHECKSUM);
+ uint dstlen = srclen;
int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
crypt_data, key_version,
@@ -711,8 +711,8 @@ static dberr_t fil_space_decrypt_for_non_full_checksum(
/* Calculate the offset where decryption starts */
const byte* src = src_frame + header_len;
byte* dst = tmp_frame + header_len;
- uint32 dstlen = 0;
uint srclen = uint(physical_size) - header_len - FIL_PAGE_DATA_END;
+ uint dstlen = srclen;
if (page_compressed) {
srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
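
The fil0crypt.cc hunks above initialize the output-length argument to the source length instead of zero before calling the encryption scheme. Assuming the usual in/out convention for such a parameter (buffer capacity on entry, bytes produced on return), the hypothetical transform below shows why a zero value would be rejected; the function and its signature are invented for illustration and are not the MariaDB encryption plugin API.

#include <cstring>
#include <iostream>

// Hypothetical cipher with an in/out length parameter:
// on entry *dstlen is the capacity of dst, on return it is the
// number of bytes actually written. (Illustrative only.)
static int xor_transform(const unsigned char* src, unsigned srclen,
                         unsigned char* dst, unsigned* dstlen,
                         unsigned char key)
{
  if (*dstlen < srclen)
    return -1;                 // not enough room reported by the caller
  for (unsigned i = 0; i < srclen; i++)
    dst[i] = src[i] ^ key;
  *dstlen = srclen;            // report what was produced
  return 0;
}

int main()
{
  unsigned char src[16], dst[16];
  std::memset(src, 0xab, sizeof src);

  unsigned srclen = sizeof src;
  unsigned dstlen = srclen;    // initialize to the capacity, not to 0
  if (xor_transform(src, srclen, dst, &dstlen, 0x5a))
    return 1;
  std::cout << "wrote " << dstlen << " bytes\n";
}
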
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index e8d23657..b16a4c54 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -517,6 +517,9 @@ void fil_space_t::flush_low()
break;
}
+ if (fil_system.is_write_through())
+ goto skip_flush;
+
fil_n_pending_tablespace_flushes++;
for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node= UT_LIST_GET_NEXT(chain, node))
@@ -541,8 +544,9 @@ void fil_space_t::flush_low()
mysql_mutex_unlock(&fil_system.mutex);
}
- clear_flush();
fil_n_pending_tablespace_flushes--;
+skip_flush:
+ clear_flush();
}
/** Try to extend a tablespace.
@@ -665,6 +669,19 @@ fil_space_extend_must_retry(
return false;
}
+bool recv_sys_t::check_sys_truncate()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ if (!truncated_sys_space.lsn)
+ return false;
+ if (fil_system.sys_space->size <= fil_system.sys_space->recv_size)
+ {
+ truncated_sys_space={0,0};
+ return false;
+ }
+ return true;
+}
+
/** @return whether the file is usable for io() */
ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
{
@@ -681,6 +698,8 @@ ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
else if (node->deferred);
else if (auto desired_size= recv_size)
{
+ if (id == TRX_SYS_SPACE && recv_sys.check_sys_truncate())
+ goto clear;
bool success;
while (fil_space_extend_must_retry(this, node, desired_size, &success))
mysql_mutex_lock(&fil_system.mutex);
@@ -768,7 +787,6 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
{
if (space->is_in_unflushed_spaces)
{
- ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false;
fil_system.unflushed_spaces.remove(*space);
}
@@ -801,7 +819,6 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
if (space->is_in_unflushed_spaces)
{
- ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
space->is_in_unflushed_spaces= false;
unflushed_spaces.remove(*space);
}
@@ -1229,9 +1246,6 @@ void fil_system_t::create(ulint hash_size)
ut_ad(!is_initialised());
ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
ut_ad(srv_page_size);
- ut_ad(!spaces.array);
-
- m_initialised = true;
compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX));
compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN));
@@ -1242,6 +1256,8 @@ void fil_system_t::create(ulint hash_size)
spaces.create(hash_size);
+ need_unflushed_spaces = !write_through && buf_dblwr.need_fsync();
+
fil_space_crypt_init();
#ifdef __linux__
ssd.clear();
@@ -1315,13 +1331,12 @@ void fil_system_t::close()
if (is_initialised())
{
- m_initialised= false;
spaces.free();
mysql_mutex_destroy(&mutex);
fil_space_crypt_cleanup();
}
- ut_ad(!spaces.array);
+ ut_ad(!is_initialised());
#ifdef __linux__
ssd.clear();
@@ -1364,6 +1379,123 @@ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
mysql_mutex_unlock(&mutex);
}
+ATTRIBUTE_COLD void fil_space_t::reopen_all()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ fil_system.freeze_space_list++;
+
+ for (fil_space_t &space : fil_system.space_list)
+ {
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (node->is_open())
+ goto need_to_close;
+ continue;
+
+ need_to_close:
+ uint32_t p= space.n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+ if (p & (STOPPING | CLOSING))
+ continue;
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ continue;
+
+ ulint type= OS_DATA_FILE;
+
+#if defined _WIN32 || defined O_DIRECT
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(space.flags)) {
+ case 1: case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ }
+#endif
+
+ for (ulint count= 10000; count--;)
+ {
+ p= space.pending();
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ break;
+
+ if (!(p & PENDING) && !node->being_extended)
+ {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ /* Unconditionally flush the file, because
+ fil_system.write_through was updated prematurely,
+ potentially causing some flushes to be lost. */
+ os_file_flush(node->handle);
+ mysql_mutex_lock(&fil_system.mutex);
+ p= space.n_pending.fetch_sub(1, std::memory_order_relaxed) - 1;
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ break;
+
+ if (!(p & PENDING) && !node->being_extended)
+ {
+ ut_a(os_file_close(node->handle));
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW : OS_FILE_OPEN,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+ ut_a(success);
+ goto next_file;
+ }
+ }
+
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
+ space.release();
+
+ if (!node->is_open())
+ goto next_file;
+ }
+
+ if (!(p & CLOSING) || (p & STOPPING))
+ next_file:
+ continue;
+
+ sql_print_error("InnoDB: Failed to reopen file '%s' due to " UINT32PF
+ " operations", node->name, p & PENDING);
+ }
+ }
+
+ fil_system.freeze_space_list--;
+}
+
+void fil_system_t::set_write_through(bool write_through)
+{
+ mysql_mutex_lock(&mutex);
+
+ if (write_through != is_write_through())
+ {
+ this->write_through= write_through;
+ fil_space_t::reopen_all();
+ need_unflushed_spaces = !write_through && buf_dblwr.need_fsync();
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
+void fil_system_t::set_buffered(bool buffered)
+{
+ mysql_mutex_lock(&mutex);
+
+ if (buffered != is_buffered())
+ {
+ this->buffered= buffered;
+ fil_space_t::reopen_all();
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
/** Close all tablespace files at shutdown */
void fil_space_t::close_all()
{
@@ -1384,12 +1516,9 @@ void fil_space_t::close_all()
for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL;
node= UT_LIST_GET_NEXT(chain, node))
{
-
if (!node->is_open())
- {
next:
continue;
- }
for (ulint count= 10000; count--;)
{
@@ -1408,8 +1537,8 @@ void fil_space_t::close_all()
goto next;
}
- ib::error() << "File '" << node->name << "' has " << space.referenced()
- << " operations";
+ sql_print_error("InnoDB: File '%s' has " UINT32PF " operations",
+ node->name, space.referenced());
}
fil_system.detach(&space);
@@ -2718,19 +2847,18 @@ static void fil_invalid_page_access_msg(const char *name,
}
/** Update the data structures on write completion */
-inline void fil_node_t::complete_write()
+void fil_space_t::complete_write()
{
mysql_mutex_assert_not_owner(&fil_system.mutex);
- if (space->purpose != FIL_TYPE_TEMPORARY &&
- srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC &&
- space->set_needs_flush())
+ if (purpose != FIL_TYPE_TEMPORARY &&
+ fil_system.use_unflushed_spaces() && set_needs_flush())
{
mysql_mutex_lock(&fil_system.mutex);
- if (!space->is_in_unflushed_spaces)
+ if (!is_in_unflushed_spaces)
{
- space->is_in_unflushed_spaces= true;
- fil_system.unflushed_spaces.push_front(*space);
+ is_in_unflushed_spaces= true;
+ fil_system.unflushed_spaces.push_front(*this);
}
mysql_mutex_unlock(&fil_system.mutex);
}
@@ -2830,7 +2958,7 @@ io_error:
if (!type.is_async()) {
if (type.is_write()) {
release_sync_write:
- node->complete_write();
+ complete_write();
release:
release();
goto func_exit;
@@ -2850,21 +2978,28 @@ void IORequest::write_complete(int io_error) const
{
ut_ad(fil_validate_skip());
ut_ad(node);
+ fil_space_t *space= node->space;
ut_ad(is_write());
- node->complete_write();
if (!bpage)
{
ut_ad(!srv_read_only_mode);
if (type == IORequest::DBLWR_BATCH)
+ {
buf_dblwr.flush_buffered_writes_completed(*this);
+ /* Above, we already invoked os_file_flush() on the
+ doublewrite buffer if needed. */
+ goto func_exit;
+ }
else
ut_ad(type == IORequest::WRITE_ASYNC);
}
else
buf_page_write_complete(*this, io_error);
- node->space->release();
+ space->complete_write();
+ func_exit:
+ space->release();
}
void IORequest::read_complete(int io_error) const
@@ -2874,10 +3009,6 @@ void IORequest::read_complete(int io_error) const
ut_ad(is_read());
ut_ad(bpage);
- /* IMPORTANT: since i/o handling for reads will read also the insert
- buffer in fil_system.sys_space, we have to be very careful not to
- introduce deadlocks. We never close fil_system.sys_space data files
- and never issue asynchronous reads of change buffer pages. */
const page_id_t id(bpage->id());
if (UNIV_UNLIKELY(io_error != 0))
@@ -2908,14 +3039,6 @@ void IORequest::read_complete(int io_error) const
possibly cached by the OS. */
void fil_flush_file_spaces()
{
- if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
- {
- ut_d(mysql_mutex_lock(&fil_system.mutex));
- ut_ad(fil_system.unflushed_spaces.empty());
- ut_d(mysql_mutex_unlock(&fil_system.mutex));
- return;
- }
-
rescan:
mysql_mutex_lock(&fil_system.mutex);
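
fil_space_t::reopen_all() above packs state flags and a pending-operation count into the single atomic word n_pending, setting CLOSING with fetch_or and polling the low bits until in-flight I/O drains. The sketch below shows the general packing technique only; the bit assignments and member names are made up and do not match fil0fil.h.

#include <atomic>
#include <cstdint>
#include <iostream>

// Illustrative packing: two high flag bits plus a pending-I/O count
// in the low bits, all in one atomic word (constants are made up).
class pending_counter
{
  static constexpr uint32_t STOPPING = 1U << 31;
  static constexpr uint32_t CLOSING  = 1U << 30;
  static constexpr uint32_t PENDING  = CLOSING - 1;   // low-bit mask

  std::atomic<uint32_t> n{0};
public:
  void acquire() { n.fetch_add(1, std::memory_order_acquire); }
  void release() { n.fetch_sub(1, std::memory_order_release); }

  // request a close; returns false if a close or stop was already requested
  bool set_closing()
  {
    return !(n.fetch_or(CLOSING, std::memory_order_acquire)
             & (CLOSING | STOPPING));
  }

  // true when the flag is set and no I/O is in flight
  bool can_close() const
  {
    uint32_t p = n.load(std::memory_order_acquire);
    return (p & CLOSING) && !(p & PENDING);
  }
};

int main()
{
  pending_counter c;
  c.acquire();                 // one I/O in flight
  c.set_closing();
  std::cout << "drained: " << c.can_close() << '\n';   // 0
  c.release();
  std::cout << "drained: " << c.can_close() << '\n';   // 1
}
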
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
index eaf4e04a..baa4aca1 100644
--- a/storage/innobase/fil/fil0pagecompress.cc
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (C) 2013, 2021, MariaDB Corporation.
+Copyright (C) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -47,7 +47,6 @@ Updated 14/02/2015
#include "trx0sys.h"
#include "row0mysql.h"
#include "buf0lru.h"
-#include "ibuf0ibuf.h"
#include "zlib.h"
#include "row0mysql.h"
#include "lz4.h"
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 5f34fe93..1793ff44 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -33,7 +33,6 @@ Created 11/29/1995 Heikki Tuuri
#include "page0page.h"
#include "srv0srv.h"
#include "srv0start.h"
-#include "ibuf0ibuf.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "dict0boot.h"
@@ -508,7 +507,7 @@ dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
const page_id_t page_id(space->id, 0);
const ulint zip_size = space->zip_size();
- buf_block_t *free_block = buf_LRU_get_free_block(false);
+ buf_block_t *free_block = buf_LRU_get_free_block(have_no_mutex);
mtr->x_lock_space(space);
@@ -842,7 +841,7 @@ fsp_fill_free_list(
if (i)
{
- buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *f= buf_LRU_get_free_block(have_no_mutex);
buf_block_t *block= buf_page_create(space, i, zip_size, mtr, f);
if (UNIV_UNLIKELY(block != f))
buf_pool.free_block(f);
@@ -853,11 +852,18 @@ fsp_fill_free_list(
if (space->purpose != FIL_TYPE_TEMPORARY)
{
- buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *f= buf_LRU_get_free_block(have_no_mutex);
buf_block_t *block=
- buf_page_create(space, i + FSP_IBUF_BITMAP_OFFSET, zip_size, mtr, f);
+ buf_page_create(space, i + 1, zip_size, mtr, f);
if (UNIV_UNLIKELY(block != f))
buf_pool.free_block(f);
+ /* The zero-initialization will reset the change buffer bitmap bits
+ to safe values for possible import to an earlier version that
+ supports change buffering:
+
+ IBUF_BITMAP_FREE = 0 (no space left for buffering inserts)
+ IBUF_BITMAP_BUFFERED = 0 (no changes have been buffered)
+ IBUF_BITMAP_IBUF = 0 (not part of the change buffer) */
fsp_init_file_page(space, block, mtr);
mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
FIL_PAGE_IBUF_BITMAP);
@@ -882,9 +888,9 @@ fsp_fill_free_list(
if (UNIV_UNLIKELY(init_xdes))
{
/* The first page in the extent is a descriptor page and the
- second is an ibuf bitmap page: mark them used */
+		second was reserved for the change buffer bitmap: mark them used */
xdes_set_free<false>(*xdes, descr, 0, mtr);
- xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
+ xdes_set_free<false>(*xdes, descr, 1, mtr);
xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
xdes, xoffset, space->free_limit, mtr))
@@ -1046,7 +1052,7 @@ fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
static buf_block_t* fsp_page_create(fil_space_t *space, uint32_t offset,
mtr_t *mtr)
{
- buf_block_t *free_block= buf_LRU_get_free_block(false),
+ buf_block_t *free_block= buf_LRU_get_free_block(have_no_mutex),
*block= buf_page_create(space, offset, space->zip_size(), mtr, free_block);
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
@@ -1679,6 +1685,7 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
ut_ad(byte_offset >= FIL_PAGE_DATA);
ut_ad(byte_offset + FSEG_HEADER_SIZE
<= srv_page_size - FIL_PAGE_DATA_END);
+ buf_block_t* iblock= 0;
mtr->x_lock_space(space);
ut_d(space->modify_check(*mtr));
@@ -1691,8 +1698,6 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
goto funct_exit;
}
- buf_block_t* iblock;
-
inode_alloc:
inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err);
@@ -3092,3 +3097,726 @@ std::ostream &fseg_header::to_stream(std::ostream &out) const
return out;
}
#endif /* UNIV_DEBUG */
+
+/** Get the extent descriptor page, reusing the copy that the
+mini-transaction has already latched whenever possible.
+@param page_id page identifier to be acquired
+@param mtr mini-transaction
+@param err error code (out)
+@return extent descriptor page, or nullptr (with *err set) on failure */
+static
+buf_block_t *fsp_get_latched_xdes_page(
+ page_id_t page_id, mtr_t *mtr, dberr_t *err)
+{
+ buf_block_t *block= nullptr;
+ block= mtr->get_already_latched(
+ page_id, MTR_MEMO_PAGE_SX_FIX);
+ if (block)
+ return block;
+ return buf_page_get_gen(
+ page_id, 0, RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, err);
+}
+
+/** Used during system tablespace truncation. Stores the
+old page images of the "to be modified" extent descriptor
+pages so that they can be restored on failure */
+class fsp_xdes_old_page
+{
+ std::vector<buf_block_t*> m_old_xdes_pages;
+ const uint32_t m_space;
+public:
+ fsp_xdes_old_page(uint32_t space):m_space(space) {}
+ ulint n_pages()
+ {
+ uint32_t count=0;
+ for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
+ if (m_old_xdes_pages[i]) count++;
+ return count;
+ }
+
+ __attribute__((warn_unused_result))
+ dberr_t insert(uint32_t page_no, mtr_t *mtr)
+ {
+ uint32_t m_index= page_no >> srv_page_size_shift;
+ if (m_old_xdes_pages.size() > m_index &&
+ m_old_xdes_pages[m_index] != nullptr)
+ return DB_SUCCESS;
+
+ DBUG_EXECUTE_IF("shrink_buffer_pool_full",
+ return DB_OUT_OF_MEMORY;);
+ dberr_t err= DB_SUCCESS;
+ buf_block_t *block= fsp_get_latched_xdes_page(
+ page_id_t(m_space, page_no), mtr, &err);
+ if (block)
+ {
+ buf_block_t *old= buf_LRU_get_free_block(have_no_mutex_soft);
+ if (!old) return DB_OUT_OF_MEMORY;
+
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
+ old->page.frame, block->page.frame, srv_page_size);
+
+ if (m_index >= m_old_xdes_pages.size())
+ m_old_xdes_pages.resize(m_index + 1);
+ m_old_xdes_pages[m_index] = old;
+ }
+ return err;
+ }
+
+ buf_block_t *search(uint32_t page_no)
+ {
+ uint32_t m_index= page_no >> srv_page_size_shift;
+    if (m_index >= m_old_xdes_pages.size())
+ return nullptr;
+ return m_old_xdes_pages[m_index];
+ }
+
+ void restore(mtr_t *mtr)
+ {
+ for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
+ {
+ if (m_old_xdes_pages[i] == nullptr) continue;
+ buf_block_t *block= mtr->get_already_latched(
+ page_id_t{m_space, i << srv_page_size_shift},
+ MTR_MEMO_PAGE_SX_FIX);
+ ut_ad(block);
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
+ block->page.frame, m_old_xdes_pages[i]->page.frame, srv_page_size);
+ }
+ }
+
+ ~fsp_xdes_old_page()
+ {
+ for (uint32_t i= 0; i < m_old_xdes_pages.size(); i++)
+ if (m_old_xdes_pages[i])
+ buf_block_free(m_old_xdes_pages[i]);
+ }
+};
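
The fsp_xdes_old_page class above snapshots each "to be modified" extent descriptor page by page number so that the whole set of changes can be rolled back with a memcpy if the shrink fails. A generic, self-contained sketch of that snapshot-and-restore pattern (the page size and all names below are illustrative, not InnoDB's):

#include <cstdint>
#include <cstring>
#include <map>
#include <vector>

// Illustrative snapshot of fixed-size pages keyed by page number,
// mirroring the save / restore idea of fsp_xdes_old_page.
class page_snapshot
{
  static constexpr size_t PAGE_BYTES = 4096;   // made-up page size
  std::map<uint32_t, std::vector<uint8_t>> saved;
public:
  // save a copy the first time a page is about to be modified
  void save(uint32_t page_no, const uint8_t* frame)
  {
    if (saved.count(page_no))
      return;                                  // already captured
    saved.emplace(page_no,
                  std::vector<uint8_t>(frame, frame + PAGE_BYTES));
  }

  // copy every captured image back over the live pages
  void restore(uint8_t* buffer_pool, size_t n_pages) const
  {
    for (const auto& p : saved)
      if (p.first < n_pages)
        std::memcpy(buffer_pool + size_t(p.first) * PAGE_BYTES,
                    p.second.data(), PAGE_BYTES);
  }

  size_t n_pages() const { return saved.size(); }
};

int main()
{
  std::vector<uint8_t> pool(4 * 4096, 0);  // four zero-filled "pages"
  page_snapshot snap;
  snap.save(1, pool.data() + 4096);        // about to modify page 1
  pool[4096] = 0xff;                       // the "modification"
  snap.restore(pool.data(), 4);            // roll it back
  return pool[4096] == 0 ? 0 : 1;
}
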
+
+/** Relink the current descriptor entry to the last valid
+descriptor entry, skipping the truncated descriptor entries in between
+@param header tablespace header page
+@param hdr_offset FSP_HEADER_OFFSET + FSP_FREE or FSP_FREE_FRAG
+@param cur_addr current descriptor
+@param last_valid_addr last valid descriptor
+@param skip_len number of truncated extent descriptor entries
+@param mtr mini-transaction
+@return error code or DB_SUCCESS */
+__attribute__((warn_unused_result))
+static
+dberr_t fsp_lst_update_skip(
+ buf_block_t *header, uint16_t hdr_offset,
+ fil_addr_t cur_addr, fil_addr_t last_valid_addr,
+ uint32_t skip_len, mtr_t *mtr)
+{
+ dberr_t err= DB_SUCCESS;
+ uint32_t space_id= header->page.id().space();
+ buf_block_t *cur= fsp_get_latched_xdes_page(
+ page_id_t(space_id, cur_addr.page), mtr, &err);
+
+ if (!cur) return err;
+ if (last_valid_addr.page == FIL_NULL)
+ {
+ /* First node, so update the FIRST pointer of base
+ with current extent descriptor and update
+ the PREV pointer of last valid descriptor with
+ FIL_NULL */
+ flst_write_addr(
+ *header,
+ header->page.frame + hdr_offset + FLST_FIRST,
+ cur_addr.page, cur_addr.boffset, mtr);
+
+ flst_write_addr(
+ *cur,
+ cur->page.frame + cur_addr.boffset + FLST_PREV,
+ last_valid_addr.page, last_valid_addr.boffset, mtr);
+ }
+ else
+ {
+ buf_block_t *prev= nullptr;
+ if (cur->page.id().page_no() == last_valid_addr.page)
+ prev= cur;
+ else
+ {
+ prev= fsp_get_latched_xdes_page(
+ page_id_t(space_id, last_valid_addr.page),
+ mtr, &err);
+ if (!prev) return err;
+ }
+
+ /* Update the NEXT pointer of last valid extent
+ descriptor entry with current extent descriptor */
+ flst_write_addr(
+ *prev,
+ prev->page.frame + last_valid_addr.boffset + FLST_NEXT,
+ cur_addr.page, cur_addr.boffset, mtr);
+
+ /* Update the PREV pointer of current extent
+ descriptor entry with last valid extent descriptor */
+ flst_write_addr(
+ *cur,
+ cur->page.frame + cur_addr.boffset + FLST_PREV,
+ last_valid_addr.page, last_valid_addr.boffset, mtr);
+ }
+
+ byte *len_bytes= &header->page.frame[hdr_offset + FLST_LEN];
+ uint32_t len= mach_read_from_4(len_bytes);
+ ut_ad(len > skip_len);
+ mtr->write<4>(*header, len_bytes, len - skip_len);
+ return DB_SUCCESS;
+}
+
+/** Write FIL_NULL into the FLST_NEXT pointer of the last valid node
+@param header tablespace header page
+@param hdr_offset FSP_HEADER_OFFSET + FSP_FREE or FSP_FREE_FRAG
+@param cur_addr current descriptor
+@param skip_len number of truncated extent descriptor entries
+@param orig_len original length of the list
+@param mtr mini-transaction
+@return error code or DB_SUCCESS */
+__attribute__((warn_unused_result))
+dberr_t
+fsp_lst_write_end(
+ buf_block_t *header, uint16_t hdr_offset,
+ fil_addr_t cur_addr, uint32_t skip_len, uint32_t orig_len,
+ mtr_t *mtr)
+{
+ dberr_t err= DB_SUCCESS;
+ byte *len_bytes= &header->page.frame[hdr_offset + FLST_LEN];
+ uint32_t len= mach_read_from_4(len_bytes);
+ if (skip_len == 0)
+ {
+func_exit:
+ if (hdr_offset == FSP_FREE_FRAG + FSP_HEADER_OFFSET)
+ {
+ byte *frag_used_byte= &header->page.frame[
+ FSP_HEADER_OFFSET + FSP_FRAG_N_USED];
+ uint32_t n_used_frag= mach_read_from_4(frag_used_byte);
+ /* Update the FSP_FRAG_N_USED value after removing
+ the truncated pages from FSP_FREE_FRAG list */
+ if (len != orig_len)
+ mtr->write<4>(*header, frag_used_byte,
+ n_used_frag - ((orig_len - len) * 2));
+ }
+ return DB_SUCCESS;
+ }
+
+ if (cur_addr.page == FIL_NULL)
+ {
+ /* There is no list, so reset base node */
+ mtr->memset(
+ header,
+ FLST_FIRST + FIL_ADDR_PAGE + hdr_offset, 4, 0xff);
+ mtr->memset(
+ header,
+ FLST_LAST + FIL_ADDR_PAGE + hdr_offset, 4, 0xff);
+ }
+ else
+ {
+    /* Update the FLST_LAST pointer in the base node to the current
+    valid extent descriptor and set FIL_NULL as the next pointer
+    of that descriptor */
+ flst_write_addr(
+ *header,
+ header->page.frame + hdr_offset + FLST_LAST,
+ cur_addr.page, cur_addr.boffset, mtr);
+
+ buf_block_t *cur_block= fsp_get_latched_xdes_page(
+ page_id_t(header->page.id().space(), cur_addr.page),
+ mtr, &err);
+
+ if (!cur_block) return err;
+
+ flst_write_addr(
+ *cur_block,
+ cur_block->page.frame + cur_addr.boffset + FLST_NEXT,
+ FIL_NULL, 0, mtr);
+ }
+
+ ut_ad(len >= skip_len);
+ len-= skip_len;
+ mtr->write<4>(*header, len_bytes, len);
+ goto func_exit;
+}
+
+/** Remove the truncated extents from the FSP_FREE or FSP_FREE_FRAG list
+@param header tablespace header page
+@param hdr_offset FSP_HEADER_OFFSET + FSP_FREE or FSP_FREE_FRAG
+@param threshold extents at or above this page number are
+ removed from the list
+@param mtr mini-transaction to remove the extents
+@return DB_SUCCESS on success or error code */
+__attribute__((warn_unused_result))
+static
+dberr_t fsp_shrink_list(buf_block_t *header, uint16_t hdr_offset,
+ uint32_t threshold, mtr_t *mtr)
+{
+ ut_ad(mach_read_from_4(header->page.frame + FIL_PAGE_OFFSET) == 0);
+ const uint32_t len= flst_get_len(hdr_offset + header->page.frame);
+ if (len == 0)
+ return DB_SUCCESS;
+
+ buf_block_t *descr_block= nullptr;
+ dberr_t err= DB_SUCCESS;
+ uint32_t skip_len= 0;
+ fil_addr_t last_valid_addr {FIL_NULL, 0}, next_addr{FIL_NULL, 0};
+ fil_addr_t addr= flst_get_first(header->page.frame + hdr_offset);
+
+ for (uint32_t i= len; i > 0; i--)
+ {
+ ut_d(fil_space_t *space= header->page.id().space() == 0
+ ? fil_system.sys_space
+ : fil_system.temp_space);
+ ut_ad(addr.page < space->size);
+ ut_ad(!(addr.page & (srv_page_size - 1)));
+ if (!descr_block || descr_block->page.id().page_no() != addr.page)
+ {
+ descr_block= fsp_get_latched_xdes_page(
+ page_id_t(header->page.id().space(), addr.page), mtr, &err);
+ if (!descr_block)
+ return err;
+ }
+
+ if (addr.page < threshold)
+ {
+      /* Relink only when we are at a non-truncated descriptor page */
+ if (skip_len)
+ {
+ err= fsp_lst_update_skip(
+ header, hdr_offset, addr, last_valid_addr, skip_len, mtr);
+ if (err) return err;
+ skip_len= 0;
+ }
+
+ if (threshold <= xdes_get_offset(
+ descr_block->page.frame + addr.boffset - XDES_FLST_NODE))
+ skip_len++;
+ else last_valid_addr= addr;
+ }
+ else skip_len++;
+
+ next_addr= flst_get_next_addr(
+ descr_block->page.frame + addr.boffset);
+ if (next_addr.page != addr.page && addr.page >= threshold)
+ {
+ mtr->release_last_page();
+ descr_block= nullptr;
+ }
+
+ if (next_addr.page == FIL_NULL)
+ {
+ err= fsp_lst_write_end(header, hdr_offset, last_valid_addr,
+ skip_len, len, mtr);
+ break;
+ }
+ addr= next_addr;
+ }
+ ut_d(if (err == DB_SUCCESS) flst_validate(header, hdr_offset, mtr););
+ return err;
+}
+
+/** Reset the XDES_BITMAP for the truncated extents
+@param space_id tablespace identifier to be truncated
+@param threshold truncated size, in pages
+@param mtr mini-transaction to reset XDES_BITMAP
+@return DB_SUCCESS or error code on failure */
+__attribute__((warn_unused_result))
+static
+dberr_t fsp_xdes_reset(uint32_t space_id, uint32_t threshold, mtr_t *mtr)
+{
+ if (!(threshold & (srv_page_size - 1)))
+ return DB_SUCCESS;
+
+ uint32_t cur_descr_page= xdes_calc_descriptor_page(0, threshold);
+ ulint descr_offset= XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(0, threshold);
+ ulint last_descr_offset= XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(
+ 0, (cur_descr_page + srv_page_size - 1));
+ last_descr_offset+= XDES_SIZE;
+ dberr_t err= DB_SUCCESS;
+ buf_block_t *block= fsp_get_latched_xdes_page(
+ page_id_t(space_id, cur_descr_page), mtr, &err);
+ if (!block)
+ return err;
+ mtr->memset(
+ block, descr_offset, (last_descr_offset - descr_offset), 0);
+ return err;
+}
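+
+/* fsp_xdes_reset() only has work to do when the truncation threshold
+is not aligned to an extent descriptor page boundary: the descriptors
+from the threshold extent to the end of that descriptor page are
+zeroed, while descriptor pages lying entirely beyond the threshold
+simply disappear with the file truncation itself. */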
+
+/** Traverse the used extents of the tablespace and do one of two things:
+1) find the last used extent, or
+2) store the old page frames of the "to be modified" extent
+descriptor pages.
+@param space             system or temporary tablespace
+@param last_used_extent  in: 0 when finding the last used extent,
+                         otherwise the truncation threshold;
+                         out: the last used extent
+@param mtr               mini-transaction
+@param old_xdes_entry    nullptr, or the object in which to store the
+                         old page content of the "to be modified"
+                         extent descriptor pages
+@return DB_SUCCESS or error code */
+__attribute__((warn_unused_result))
+dberr_t fsp_traverse_extents(
+ fil_space_t *space, uint32_t *last_used_extent, mtr_t *mtr,
+ fsp_xdes_old_page *old_xdes_entry= nullptr)
+{
+ dberr_t err= DB_SUCCESS;
+ bool find_last_used_extent= (old_xdes_entry == nullptr);
+ uint32_t threshold= *last_used_extent;
+ uint32_t last_descr_page_no= xdes_calc_descriptor_page(
+ 0, space->free_limit - 1);
+
+ if (find_last_used_extent)
+ *last_used_extent= space->free_limit;
+ else
+ {
+ err= old_xdes_entry->insert(0, mtr);
+ if (err) return err;
+ if (threshold & (srv_page_size - 1))
+ err= old_xdes_entry->insert(
+ xdes_calc_descriptor_page(0, threshold), mtr);
+ }
+
+ buf_block_t *block= nullptr;
+ std::vector<uint32_t> modified_xdes;
+
+ for (uint32_t cur_extent=
+ ((space->free_limit - 1)/ FSP_EXTENT_SIZE) * FSP_EXTENT_SIZE;
+ cur_extent >= threshold;)
+ {
+ if (!block)
+ {
+ block= fsp_get_latched_xdes_page(
+ page_id_t(space->id, last_descr_page_no),
+ mtr, &err);
+ if (!block) return err;
+ }
+
+ xdes_t *descr= XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(0, cur_extent)
+ + block->page.frame;
+
+ if (find_last_used_extent)
+ {
+ ulint state= xdes_get_state(descr);
+ if (state == XDES_FREE)
+ *last_used_extent= cur_extent;
+ else if (state == XDES_FREE_FRAG &&
+ !(cur_extent & (srv_page_size - 1)) &&
+ xdes_get_n_used(descr) == 2)
+ /* Extent Descriptor Page */
+ *last_used_extent= cur_extent;
+ else return DB_SUCCESS;
+ }
+ else
+ {
+ fil_addr_t prev_addr= flst_get_prev_addr(
+ descr + XDES_FLST_NODE);
+ ut_ad(prev_addr.page < space->size ||
+ prev_addr.page == FIL_NULL);
+ ut_ad(prev_addr.page == FIL_NULL ||
+ !(prev_addr.page & (srv_page_size - 1)));
+
+ fil_addr_t next_addr= flst_get_next_addr(
+ descr + XDES_FLST_NODE);
+ ut_ad(next_addr.page < space->size ||
+ next_addr.page == FIL_NULL);
+ ut_ad(next_addr.page == FIL_NULL ||
+ !(next_addr.page & (srv_page_size - 1)));
+
+ if (prev_addr.page < threshold)
+ modified_xdes.push_back(prev_addr.page);
+
+ if (next_addr.page < threshold)
+ modified_xdes.push_back(next_addr.page);
+ }
+
+ cur_extent-= FSP_EXTENT_SIZE;
+ uint32_t cur_descr_page= xdes_calc_descriptor_page(0, cur_extent);
+ if (last_descr_page_no != cur_descr_page)
+ {
+ if (last_descr_page_no >= threshold)
+ mtr->release_last_page();
+ last_descr_page_no= cur_descr_page;
+ block= nullptr;
+ }
+ }
+
+ if (!find_last_used_extent)
+ {
+ for (auto it : modified_xdes)
+ {
+ err= old_xdes_entry->insert(it, mtr);
+ if (err) return err;
+ }
+ modified_xdes.clear();
+ }
+ return err;
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the FSP lists of the system tablespace
+@param space  system tablespace
+@return DB_SUCCESS or error code */
+__attribute__((warn_unused_result))
+dberr_t fsp_tablespace_validate(fil_space_t *space)
+{
+  /* Validate all FSP lists in the system tablespace */
+ mtr_t local_mtr;
+ dberr_t err= DB_SUCCESS;
+ local_mtr.start();
+ if (buf_block_t *header= fsp_get_header(
+ space, &local_mtr, &err))
+ {
+ flst_validate(header, FSP_FREE + FSP_HEADER_OFFSET, &local_mtr);
+ flst_validate(header, FSP_FREE_FRAG + FSP_HEADER_OFFSET,
+ &local_mtr);
+ flst_validate(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ &local_mtr);
+ flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ &local_mtr);
+ flst_validate(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ &local_mtr);
+ }
+ local_mtr.commit();
+ return err;
+}
+#endif /* UNIV_DEBUG */
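+
+/* fsp_tablespace_validate() is compiled only in debug builds; the
+callers below invoke it as ut_ad(!fsp_tablespace_validate(space)), so
+the check vanishes entirely from release builds. */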
+
+void fsp_system_tablespace_truncate()
+{
+ uint32_t last_used_extent= 0;
+ fil_space_t *space= fil_system.sys_space;
+ mtr_t mtr;
+ mtr.start();
+ mtr.x_lock_space(space);
+ dberr_t err= fsp_traverse_extents(space, &last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ {
+func_exit:
+ sql_print_warning("InnoDB: Cannot shrink the system tablespace "
+ "due to %s", ut_strerr(err));
+ mtr.commit();
+ return;
+ }
+ uint32_t fixed_size= srv_sys_space.get_min_size(),
+ header_size= space->size_in_header;
+ mtr.commit();
+
+ if (last_used_extent >= header_size || fixed_size >= header_size)
+    /* Nothing to shrink: the tablespace is fully used, or is
+    already within the configured minimum size */
+ return;
+
+ /* Set fixed size as threshold to truncate */
+ if (fixed_size > last_used_extent)
+ last_used_extent= fixed_size;
+
+ bool old_dblwr_buf= buf_dblwr.in_use();
+  /* Flush all pages in the buffer pool and make a checkpoint, so
+  that the doublewrite buffer can be disabled safely and there will
+  be enough space in the redo log for the shrink operation */
+ log_make_checkpoint();
+ fil_system.set_use_doublewrite(false);
+
+ buf_block_t *header= nullptr;
+ ut_ad(!fsp_tablespace_validate(space));
+
+ mtr.start();
+ mtr.x_lock_space(space);
+
+ {
+    /* Take a rough estimate of the extent descriptor pages that
+    will be modified and store their old contents */
+ fsp_xdes_old_page old_xdes_list(space->id);
+ err= fsp_traverse_extents(space, &last_used_extent, &mtr, &old_xdes_list);
+
+ if (err == DB_OUT_OF_MEMORY)
+ {
+ mtr.commit();
+ sql_print_warning("InnoDB: Cannot shrink the system "
+ "tablespace from " UINT32PF" to "
+ UINT32PF " pages due to insufficient "
+ "innodb_buffer_pool_size", space->size,
+ last_used_extent);
+ return;
+ }
+
+ sql_print_information("InnoDB: Truncating system tablespace from "
+ UINT32PF " to " UINT32PF " pages", space->size,
+ last_used_extent);
+
+ header= fsp_get_latched_xdes_page(
+ page_id_t(space->id, 0), &mtr, &err);
+ if (!header)
+ goto func_exit;
+
+ mtr.write<4, mtr_t::FORCED>(
+ *header, FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame,
+ last_used_extent);
+
+ if (space->free_limit > last_used_extent)
+ mtr.write<4,mtr_t::MAYBE_NOP>(*header, FSP_HEADER_OFFSET
+ + FSP_FREE_LIMIT + header->page.frame,
+ last_used_extent);
+ err= fsp_shrink_list(
+ header, FSP_HEADER_OFFSET + FSP_FREE, last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ err= fsp_shrink_list(
+ header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ err= fsp_xdes_reset(space->id, last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ mtr.trim_pages(page_id_t(0, last_used_extent));
+ size_t shrink_redo_size= mtr.get_log_size();
+
+ DBUG_EXECUTE_IF("mtr_log_max_size", goto mtr_max;);
+ if (shrink_redo_size >
+ (2 << 20) - 8 /* encryption nonce */ - 5 /* EOF, checksum */)
+ {
+#ifndef DBUG_OFF
+mtr_max:
+#endif
+      /* Replace the modified copies in the buffer pool with the
+      original copies of the pages. */
+ old_xdes_list.restore(&mtr);
+ mtr.discard_modifications();
+ mtr.commit();
+ ut_ad(!fsp_tablespace_validate(space));
+ sql_print_error(
+ "InnoDB: Cannot shrink the system tablespace "
+ "because the mini-transaction log size (%zu bytes) "
+ "exceeds 2 MiB", shrink_redo_size + 8 + 5);
+ return;
+ }
+ }
+
+ if (space->free_limit > last_used_extent)
+ space->free_limit= last_used_extent;
+ space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE +
+ header->page.frame);
+
+ mtr.commit_shrink(*space, last_used_extent);
+ sql_print_information("InnoDB: System tablespace truncated successfully");
+ fil_system.set_use_doublewrite(old_dblwr_buf);
+}
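+
+/* The shrink of the system tablespace is written as one
+mini-transaction; if its log would exceed the 2 MiB limit checked
+above, the saved extent descriptor pages are restored and the
+operation is abandoned. The function is invoked from innobase_end()
+during a slow shutdown. */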
+
+inline void fil_space_t::clear_freed_ranges(uint32_t threshold)
+{
+ ut_ad(id == SRV_TMP_SPACE_ID);
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ range_set current_ranges;
+ for (const auto &range : freed_ranges)
+ {
+ if (range.first >= threshold)
+ continue;
+ else if (range.last > threshold)
+ {
+ range_t new_range{range.first, threshold - 1};
+ current_ranges.add_range(new_range);
+ continue;
+ }
+ current_ranges.add_range(range);
+ }
+ freed_ranges= std::move(current_ranges);
+}
+
+void fsp_shrink_temp_space()
+{
+ uint32_t last_used_extent= 0;
+ fil_space_t *space= fil_system.temp_space;
+ mtr_t mtr;
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ mtr.x_lock_space(space);
+ dberr_t err= fsp_traverse_extents(space, &last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ {
+func_exit:
+ sql_print_warning("InnoDB: Cannot shrink the temporary tablespace "
+ "due to %s", ut_strerr(err));
+ mtr.commit();
+ return;
+ }
+ uint32_t fixed_size= srv_tmp_space.get_min_size(),
+ header_size= space->size_in_header;
+
+ if (last_used_extent >= header_size || fixed_size >= header_size)
+ {
+    /* Nothing to shrink: the tablespace is fully used, or is
+    already within the configured minimum size */
+ mtr.commit();
+ return;
+ }
+
+ /* Set fixed size as threshold to truncate */
+ if (fixed_size > last_used_extent)
+ last_used_extent= fixed_size;
+
+ sql_print_information("InnoDB: Truncating temporary tablespace from "
+ UINT32PF " to " UINT32PF " pages", space->size,
+ last_used_extent);
+
+ buf_block_t *header= fsp_get_latched_xdes_page(
+ page_id_t(space->id, 0), &mtr, &err);
+ if (!header)
+ goto func_exit;
+
+ mach_write_to_4(
+ FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame,
+ last_used_extent);
+
+ if (space->free_limit > last_used_extent)
+ mach_write_to_4(
+ FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame,
+ last_used_extent);
+
+ mtr.set_modified(*header);
+
+ err= fsp_shrink_list(header, FSP_HEADER_OFFSET + FSP_FREE,
+ last_used_extent, &mtr);
+
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ err= fsp_shrink_list(
+ header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ last_used_extent, &mtr);
+ DBUG_EXECUTE_IF("fail_temp_truncate", err= DB_ERROR;);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ err= fsp_xdes_reset(space->id, last_used_extent, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ space->clear_freed_ranges(last_used_extent);
+ buf_LRU_truncate_temp(last_used_extent);
+ mysql_mutex_lock(&fil_system.mutex);
+
+ space->size= last_used_extent;
+ if (space->free_limit > last_used_extent)
+ space->free_limit= space->size;
+
+ space->free_len= flst_get_len(
+ FSP_HEADER_OFFSET + FSP_FREE+ header->page.frame);
+
+  /* New size of the last data file after truncation */
+ uint32_t new_last_file_size=
+ last_used_extent -
+ (fixed_size - srv_tmp_space.m_files.at(
+ srv_tmp_space.m_files.size() - 1).param_size());
+
+ space->size_in_header= space->size;
+ space->chain.end->size= new_last_file_size;
+ srv_tmp_space.set_last_file_size(new_last_file_size);
+ mysql_mutex_unlock(&fil_system.mutex);
+ os_file_truncate(
+ space->chain.end->name, space->chain.end->handle,
+ os_offset_t{space->chain.end->size} << srv_page_size_shift, true);
+ mtr.commit();
+ sql_print_information("InnoDB: Temporary tablespace truncated successfully");
+}
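+
+/* Unlike the system tablespace, the temporary tablespace is shrunk
+without generating redo log (MTR_LOG_NO_REDO) and its last data file
+is truncated directly with os_file_truncate(). Presumably this is
+what the new innodb_truncate_temporary_tablespace_now variable in
+ha_innodb.cc triggers on demand. */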
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index 4ac9da50..be52e7e4 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -101,6 +101,7 @@ SysTablespace::parse_params(
ut_ad(m_last_file_size_max == 0);
ut_ad(!m_auto_extend_last_file);
+ ut_ad(!m_auto_shrink);
char* new_str = mem_strdup(filepath_spec);
char* str = new_str;
@@ -147,6 +148,11 @@ SysTablespace::parse_params(
str = parse_units(str, &size);
}
+ if (0 == strncmp(str, ":autoshrink",
+ (sizeof ":autoshrink") - 1)) {
+ str += (sizeof ":autoshrink") - 1;
+ }
+
if (*str != '\0') {
ut_free(new_str);
ib::error()
@@ -267,6 +273,12 @@ SysTablespace::parse_params(
str = parse_units(str, &m_last_file_size_max);
}
+ if (0 == strncmp(str, ":autoshrink",
+ (sizeof ":autoshrink") - 1)) {
+ str += (sizeof ":autoshrink") - 1;
+ m_auto_shrink = true;
+ }
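+	/* Assumed syntax example: the attribute is appended to the last
+	file specification, e.g.
+	innodb_data_file_path=ibdata1:12M:autoextend:autoshrink */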
+
if (*str != '\0') {
ut_free(new_str);
ib::error() << "syntax error in file path or"
@@ -334,6 +346,7 @@ SysTablespace::shutdown()
m_created_new_raw = 0;
m_is_tablespace_full = false;
m_sanity_checks_done = false;
+ m_auto_shrink = false;
}
/** Verify the size of the physical file.
@@ -992,6 +1005,7 @@ SysTablespace::normalize_size()
for (files_t::iterator it = m_files.begin(); it != end; ++it) {
it->m_size <<= (20U - srv_page_size_shift);
+ it->m_user_param_size = it->m_size;
}
m_last_file_size_max <<= (20U - srv_page_size_shift);
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
index 48e2fbe3..84b38bc6 100644
--- a/storage/innobase/fut/fut0lst.cc
+++ b/storage/innobase/fut/fut0lst.cc
@@ -35,8 +35,8 @@ Created 11/28/1995 Heikki Tuuri
@param[in] page page number
@param[in] boffset byte offset
@param[in,out] mtr mini-transaction */
-static void flst_write_addr(const buf_block_t& block, byte *faddr,
- uint32_t page, uint16_t boffset, mtr_t* mtr)
+void flst_write_addr(const buf_block_t &block, byte *faddr,
+ uint32_t page, uint16_t boffset, mtr_t *mtr)
{
ut_ad(mtr->memo_contains_page_flagged(faddr, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
@@ -47,6 +47,14 @@ static void flst_write_addr(const buf_block_t& block, byte *faddr,
static_assert(FIL_ADDR_BYTE == 4, "compatibility");
static_assert(FIL_ADDR_SIZE == 6, "compatibility");
+ if (!mtr->is_logged())
+ {
+ mach_write_to_4(faddr + FIL_ADDR_PAGE, page);
+ mach_write_to_2(faddr + FIL_ADDR_BYTE, boffset);
+ mtr->set_modified(block);
+ return;
+ }
+
const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page;
const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset;
if (same_page)
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
index 83afd732..f75eab07 100644
--- a/storage/innobase/gis/gis0rtree.cc
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2022, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,7 +34,6 @@ Created 2013/03/27 Allen Lai and Jimmy Yang
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
-#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "srv0mon.h"
#include "gis0geo.h"
@@ -538,7 +537,7 @@ err_exit:
mem_heap_free(heap);
}
-MY_ATTRIBUTE((nonnull, warn_unused_result))
+MY_ATTRIBUTE((nonnull(1,3,4,5,6,8), warn_unused_result))
/**************************************************************//**
Update parent page's MBR and Predicate lock information during a split */
static
@@ -552,6 +551,7 @@ rtr_adjust_upper_level(
buf_block_t* new_block, /*!< in/out: the new half page */
rtr_mbr_t* mbr, /*!< in: MBR on the old page */
rtr_mbr_t* new_mbr, /*!< in: MBR on the new page */
+ que_thr_t* thr, /*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
{
ulint page_no;
@@ -570,7 +570,6 @@ rtr_adjust_upper_level(
/* Create a memory heap where the data tuple is stored */
heap = mem_heap_create(1024);
- cursor.thr = sea_cur->thr;
cursor.page_cur.index = sea_cur->index();
cursor.page_cur.block = block;
@@ -584,7 +583,8 @@ rtr_adjust_upper_level(
/* Set new mbr for the old page on the upper level. */
/* Look up the index for the node pointer to page */
- offsets = rtr_page_get_father_block(NULL, heap, mtr, sea_cur, &cursor);
+ offsets = rtr_page_get_father_block(nullptr, heap, sea_cur, &cursor,
+ thr, mtr);
page_cursor = btr_cur_get_page_cur(&cursor);
@@ -669,7 +669,7 @@ rtr_adjust_upper_level(
if (next_page_no == FIL_NULL) {
} else if (buf_block_t* next_block =
btr_block_get(*sea_cur->index(), next_page_no, RW_X_LATCH,
- false, mtr, &err)) {
+ mtr, &err)) {
if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame
+ FIL_PAGE_PREV,
block->page.frame
@@ -691,11 +691,6 @@ rtr_adjust_upper_level(
/*************************************************************//**
Moves record list to another page for rtree splitting.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return error code
@retval DB_FAIL on ROW_FORMAT=COMPRESSED compression failure */
static
@@ -731,8 +726,7 @@ rtr_split_page_move_rec_list(
ulint max_to_move = 0;
rtr_rec_move_t* rec_move = NULL;
- ut_ad(!dict_index_is_ibuf(index));
- ut_ad(dict_index_is_spatial(index));
+ ut_ad(index->is_spatial());
rec_offs_init(offsets_);
@@ -867,7 +861,8 @@ rtr_page_split_and_insert(
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr, /*!< in: mtr */
- dberr_t* err) /*!< out: error code */
+ dberr_t* err, /*!< out: error code */
+ que_thr_t* thr) /*!< in: query thread */
{
buf_block_t* block;
page_t* page;
@@ -895,6 +890,8 @@ rtr_page_split_and_insert(
int first_rec_group = 1;
IF_DBUG(bool iterated = false,);
+ buf_pool.pages_split++;
+
if (!*heap) {
*heap = mem_heap_create(1024);
}
@@ -1159,7 +1156,7 @@ after_insert:
/* Adjust the upper level. */
*err = rtr_adjust_upper_level(cursor, flags, block, new_block,
- &mbr, &new_mbr, mtr);
+ &mbr, &new_mbr, thr, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return nullptr;
}
@@ -1179,13 +1176,6 @@ after_insert:
/* If the new res insert fail, we need to do another split
again. */
if (!rec) {
- /* We play safe and reset the free bits for new_page */
- if (!dict_index_is_clust(cursor->index())
- && !cursor->index()->table->is_temporary()) {
- ibuf_reset_free_bits(new_block);
- ibuf_reset_free_bits(block);
- }
-
/* We need to clean the parent path here and search father
node later, otherwise, it's possible that find a wrong
parent. */
@@ -1212,6 +1202,244 @@ after_insert:
return(rec);
}
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts the tuple.
+NOTE that the operation of this function must always succeed; we
+cannot reverse it. Therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record, or nullptr on failure */
+rec_t*
+rtr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err, /*!< out: error code */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t* page_cursor;
+ page_zip_des_t* root_page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* root;
+ buf_block_t* new_block;
+
+ root = btr_cur_get_block(cursor);
+ root_page_zip = buf_block_get_page_zip(root);
+ ut_ad(!page_is_empty(root->page.frame));
+ index = btr_cur_get_index(cursor);
+ ut_ad(index->is_spatial());
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!root_page_zip
+ || page_zip_validate(root_page_zip, root->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ const page_id_t root_id{root->page.id()};
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX));
+
+ if (index->page != root_id.page_no()) {
+ ut_ad("corrupted root page number" == 0);
+ return nullptr;
+ }
+
+ if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *root, *index->table->space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *root, *index->table->space)) {
+ return nullptr;
+ }
+
+ /* Allocate a new page to the tree. Root splitting is done by first
+ moving the root records to the new page, emptying the root, putting
+ a node pointer to the new page, and then splitting the new page. */
+
+ level = btr_page_get_level(root->page.frame);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err);
+
+ if (!new_block) {
+ return nullptr;
+ }
+
+ new_page_zip = buf_block_get_page_zip(new_block);
+ ut_a(!new_page_zip == !root_page_zip);
+ ut_a(!new_page_zip
+ || page_zip_get_size(new_page_zip)
+ == page_zip_get_size(root_page_zip));
+
+ btr_page_create(new_block, new_page_zip, index, level, mtr);
+ if (page_has_siblings(new_block->page.frame)) {
+ compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+ memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV,
+ 0xff, 8);
+ mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
+ 0xff, 8);
+ }
+ }
+
+ /* Copy the records from root to the new page one by one. */
+ dberr_t e;
+ if (!err) {
+ err = &e;
+ }
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(new_block, root,
+ page_get_infimum_rec(root->page.frame),
+ index, mtr, err)) {
+ switch (*err) {
+ case DB_SUCCESS:
+ break;
+ case DB_FAIL:
+ *err = DB_SUCCESS;
+ break;
+ default:
+ return nullptr;
+ }
+
+ ut_a(new_page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(new_block, root_page_zip,
+ root->page.frame, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+ if (index->has_locking()) {
+ lock_move_rec_list_end(
+ new_block, root,
+ page_get_infimum_rec(root->page.frame));
+ }
+
+ /* Move any existing predicate locks */
+ lock_prdt_rec_move(new_block, root_id);
+ }
+
+ constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID;
+ if (!index->is_primary()) {
+ /* In secondary indexes,
+ PAGE_MAX_TRX_ID can be reset on the root page, because
+ the field only matters on leaf pages, and the root no
+ longer is a leaf page. (Older versions of InnoDB did
+ set PAGE_MAX_TRX_ID on all secondary index pages.) */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(root, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(root->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + root->page.zip.data, 0, 8);
+ }
+ }
+ } else {
+ /* PAGE_ROOT_AUTO_INC is only present in the clustered index
+ root page; on other clustered index pages, we want to reserve
+ the field PAGE_MAX_TRX_ID for future use. */
+ byte* p = my_assume_aligned<8>(
+ PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame);
+ if (mach_read_from_8(p)) {
+ mtr->memset(new_block, max_trx_id, 8, 0);
+ if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
+ memset_aligned<8>(max_trx_id
+ + new_block->page.zip.data,
+ 0, 8);
+ }
+ }
+ }
+
+ /* If this is a pessimistic insert which is actually done to
+ perform a pessimistic update then we have stored the lock
+ information of the record to be inserted on the infimum of the
+ root page: we cannot discard the lock structs on the root page */
+
+ if (index->has_locking()) {
+ lock_update_root_raise(*new_block, root_id);
+ }
+
+ /* Create a memory heap where the node pointer is stored */
+ if (!*heap) {
+ *heap = mem_heap_create(1000);
+ }
+
+ const uint32_t new_page_no = new_block->page.id().page_no();
+ rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame));
+ ut_ad(rec); /* We just created the page. */
+
+ /* Build the node pointer (= node key and page address) for the
+ child */
+ rtr_mbr_t new_mbr;
+ rtr_page_cal_mbr(index, new_block, &new_mbr, *heap);
+ node_ptr = rtr_index_build_node_ptr(index, &new_mbr, rec, new_page_no,
+ *heap);
+ /* The node pointer must be marked as the predefined minimum record,
+ as there is no lower alphabetical limit to records in the leftmost
+ node of a level: */
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+
+ /* Rebuild the root page to get free space */
+ btr_page_empty(root, root_page_zip, index, level + 1, mtr);
+ ut_ad(!page_has_siblings(root->page.frame));
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Insert node pointer to the root */
+
+ page_cur_set_before_first(root, page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+ offsets, heap, 0, mtr);
+
+ /* The root page should only contain the node pointer
+ to new_block at this point. Thus, the data should fit. */
+ ut_a(node_ptr_rec);
+
+ page_cursor->block = new_block;
+ page_cursor->index = index;
+
+ if (tuple) {
+ ut_ad(dtuple_check_typed(tuple));
+ /* Reposition the cursor to the child node */
+ ulint low_match = 0, up_match = 0;
+
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match, &low_match,
+ page_cursor, nullptr)) {
+ if (err) {
+ *err = DB_CORRUPTION;
+ }
+ return nullptr;
+ }
+ } else {
+ page_cursor->rec = page_get_infimum_rec(new_block->page.frame);
+ }
+
+ /* Split the child and insert tuple */
+ return rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr, err, thr);
+}
+
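+/* rtr_root_raise_and_insert() mirrors btr_root_raise_and_insert()
+for spatial indexes: besides the usual lock table and hash index
+maintenance it moves any predicate locks to the new child page via
+lock_prdt_rec_move(), and then delegates the actual split of that
+child to rtr_page_split_and_insert(). */
+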
/****************************************************************//**
Following the right link to find the proper block for insert.
@return the proper block.*/
@@ -1240,6 +1468,8 @@ rtr_ins_enlarge_mbr(
/* Check path info is not empty. */
ut_ad(!btr_cur->rtr_info->parent_path->empty());
+ ut_ad(btr_cur->rtr_info->thr || !btr_cur->index()->is_committed()
+ || btr_cur->index()->table->name.is_temporary());
/* Create a memory heap. */
heap = mem_heap_create(1024);
@@ -1265,7 +1495,8 @@ rtr_ins_enlarge_mbr(
cursor.page_cur.index = page_cursor->index;
cursor.page_cur.block = block;
offsets = rtr_page_get_father_block(
- NULL, heap, mtr, btr_cur, &cursor);
+ nullptr, heap, btr_cur, &cursor,
+ btr_cur->rtr_info->thr, mtr);
page = buf_block_get_frame(block);
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index 4aab68e9..10a12a78 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -34,7 +34,6 @@ Created 2014/01/16 Jimmy Yang
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
-#include "ibuf0ibuf.h"
#include "trx0trx.h"
#include "srv0mon.h"
#include "que0que.h"
@@ -114,8 +113,8 @@ rtr_latch_leaves(
left_page_no = btr_page_get_prev(block->page.frame);
if (left_page_no != FIL_NULL) {
- btr_block_get(*cursor->index(), left_page_no, RW_X_LATCH,
- true, mtr);
+ btr_block_get(*cursor->index(), left_page_no,
+ RW_X_LATCH, mtr);
}
mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
@@ -124,7 +123,7 @@ rtr_latch_leaves(
if (right_page_no != FIL_NULL) {
btr_block_get(*cursor->index(), right_page_no,
- RW_X_LATCH, true, mtr);
+ RW_X_LATCH, mtr);
}
break;
case BTR_SEARCH_LEAF:
@@ -539,10 +538,10 @@ static void rtr_compare_cursor_rec(const rec_t *rec, dict_index_t *index,
#endif
TRANSACTIONAL_TARGET
-dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
- page_cur_mode_t mode,
- btr_latch_mode latch_mode,
- btr_cur_t *cur, mtr_t *mtr)
+dberr_t rtr_search_to_nth_level(btr_cur_t *cur, que_thr_t *thr,
+ const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr,
+ page_cur_mode_t mode, ulint level)
{
page_cur_mode_t page_mode;
page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
@@ -665,7 +664,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
dberr_t err;
auto block_savepoint= mtr->get_savepoint();
buf_block_t *block= buf_page_get_gen(page_id, zip_size, rw_latch, guess,
- buf_mode, mtr, &err, false);
+ buf_mode, mtr, &err);
if (!block)
{
if (err)
@@ -730,7 +729,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
index->set_ssn(page_get_ssn_id(page) + 1);
/* Save the MBR */
- cur->rtr_info->thr= cur->thr;
+ cur->rtr_info->thr= thr;
rtr_get_mbr_from_tuple(tuple, &cur->rtr_info->mbr);
#ifdef BTR_CUR_ADAPT
@@ -839,7 +838,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
lock_prdt_t prdt;
{
- trx_t* trx= thr_get_trx(cur->thr);
+ trx_t* trx= thr_get_trx(thr);
TMLockTrxGuard g{TMLockTrxArgs(*trx)};
lock_init_prdt_from_mbr(&prdt, &cur->rtr_info->mbr, mode,
trx->lock.lock_heap);
@@ -848,7 +847,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
if (rw_latch == RW_NO_LATCH && height != 0)
block->page.lock.s_lock();
- lock_prdt_lock(block, &prdt, index, LOCK_S, LOCK_PREDICATE, cur->thr);
+ lock_prdt_lock(block, &prdt, index, LOCK_S, LOCK_PREDICATE, thr);
if (rw_latch == RW_NO_LATCH && height != 0)
block->page.lock.s_unlock();
@@ -956,7 +955,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
if (upper_rw_latch == RW_NO_LATCH)
{
ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);
- btr_block_get(*index, page_id.page_no(), RW_X_LATCH, false, mtr, &err);
+ btr_block_get(*index, page_id.page_no(), RW_X_LATCH, mtr, &err);
}
else
{
@@ -985,19 +984,21 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
goto func_exit;
}
-dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+dberr_t rtr_search_leaf(btr_cur_t *cur, que_thr_t *thr, const dtuple_t *tuple,
btr_latch_mode latch_mode,
mtr_t *mtr, page_cur_mode_t mode)
{
- return rtr_search_to_nth_level(0, tuple, mode, latch_mode, cur, mtr);
+ return rtr_search_to_nth_level(cur, thr, tuple, latch_mode, mtr, mode, 0);
}
/** Search for a spatial index leaf page record.
-@param pcur cursor
+@param pcur cursor
+@param thr query thread
@param tuple search tuple
@param mode search mode
@param mtr mini-transaction */
-dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, que_thr_t *thr,
+ const dtuple_t *tuple,
page_cur_mode_t mode, mtr_t *mtr)
{
#ifdef UNIV_DEBUG
@@ -1016,7 +1017,8 @@ dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
pcur->search_mode= mode;
pcur->pos_state= BTR_PCUR_IS_POSITIONED;
pcur->trx_if_known= nullptr;
- return rtr_search_leaf(&pcur->btr_cur, tuple, BTR_SEARCH_LEAF, mtr, mode);
+ return rtr_search_leaf(&pcur->btr_cur, thr, tuple, BTR_SEARCH_LEAF, mtr,
+ mode);
}
/**************************************************************//**
@@ -1026,6 +1028,7 @@ bool rtr_search(
const dtuple_t* tuple, /*!< in: tuple on which search done */
btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+	que_thr_t*	thr,	/*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
{
static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), "");
@@ -1054,15 +1057,16 @@ bool rtr_search(
btr_cur_t* btr_cursor = btr_pcur_get_btr_cur(cursor);
btr_cursor->rtr_info
- = rtr_create_rtr_info(false, false,
- btr_cursor, cursor->index());
+ = rtr_create_rtr_info(false, false, thr, btr_cursor);
- if (btr_cursor->thr) {
+ if (!thr) {
+		/* Purge will U-lock the tree instead of taking page locks */
+ } else {
btr_cursor->rtr_info->need_page_lock = true;
- btr_cursor->rtr_info->thr = btr_cursor->thr;
+ btr_cursor->rtr_info->thr = thr;
}
- if (rtr_search_leaf(btr_cursor, tuple, latch_mode, mtr)
+ if (rtr_search_leaf(btr_cursor, thr, tuple, latch_mode, mtr)
!= DB_SUCCESS) {
return true;
}
@@ -1109,12 +1113,14 @@ bool rtr_search(
about parent nodes in search
@param[out] cursor cursor on node pointer record,
its page x-latched
+@param[in,out] thr query thread
@return whether the cursor was successfully positioned */
-bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor,
+ que_thr_t *thr)
{
mem_heap_t *heap = mem_heap_create(100);
rec_offs *offsets= rtr_page_get_father_block(nullptr, heap,
- mtr, sea_cur, cursor);
+ sea_cur, cursor, thr, mtr);
mem_heap_free(heap);
return offsets != nullptr;
}
@@ -1131,12 +1137,13 @@ static const rec_t* rtr_get_father_node(
btr_cur_t* sea_cur,/*!< in: search cursor */
btr_cur_t* btr_cur,/*!< in/out: tree cursor; the cursor page is
s- or x-latched, but see also above! */
+ que_thr_t* thr, /*!< in/out: query thread */
ulint page_no,/*!< Current page no */
mtr_t* mtr) /*!< in: mtr */
{
const rec_t* rec = nullptr;
auto had_rtr = btr_cur->rtr_info;
- dict_index_t* const index = btr_cur->index();
+ ut_d(dict_index_t* const index = btr_cur->index());
/* Try to optimally locate the parent node. Level should always
less than sea_cur->tree_height unless the root is splitting */
@@ -1167,10 +1174,10 @@ static const rec_t* rtr_get_father_node(
rtr_clean_rtr_info(btr_cur->rtr_info, true);
}
- btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index);
+ btr_cur->rtr_info = rtr_create_rtr_info(false, false, thr, btr_cur);
- if (rtr_search_to_nth_level(level, tuple, PAGE_CUR_RTREE_LOCATE,
- BTR_CONT_MODIFY_TREE, btr_cur, mtr)
+ if (rtr_search_to_nth_level(btr_cur, thr, tuple, BTR_CONT_MODIFY_TREE,
+ mtr, PAGE_CUR_RTREE_LOCATE, level)
!= DB_SUCCESS) {
} else if (sea_cur && sea_cur->tree_height == level) {
rec = btr_cur_get_rec(btr_cur);
@@ -1218,6 +1225,7 @@ rtr_page_get_father_node_ptr(
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
out: cursor on node pointer record,
its page x-latched */
+ que_thr_t* thr, /*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
{
dtuple_t* tuple;
@@ -1253,7 +1261,7 @@ rtr_page_get_father_node_ptr(
const rec_t* node_ptr = rtr_get_father_node(level + 1, tuple,
sea_cur, cursor,
- page_no, mtr);
+ thr, page_no, mtr);
if (!node_ptr) {
return nullptr;
}
@@ -1279,18 +1287,20 @@ rtr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- mtr_t* mtr, /*!< in: mtr */
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
about parent nodes in search */
- btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ btr_cur_t* cursor, /*!< out: cursor on node pointer record,
its page x-latched */
+ que_thr_t* thr, /*!< in/out: query thread */
+ mtr_t* mtr) /*!< in/out: mtr */
{
rec_t *rec=
page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
if (!rec)
return nullptr;
cursor->page_cur.rec= rec;
- return rtr_page_get_father_node_ptr(offsets, heap, sea_cur, cursor, mtr);
+ return rtr_page_get_father_node_ptr(offsets, heap, sea_cur, cursor,
+ thr, mtr);
}
/*******************************************************************//**
@@ -1303,12 +1313,12 @@ rtr_create_rtr_info(
bool init_matches, /*!< in: Whether to initiate the
"matches" structure for collecting
matched leaf records */
- btr_cur_t* cursor, /*!< in: tree search cursor */
- dict_index_t* index) /*!< in: index struct */
+ que_thr_t* thr, /*!< in/out: query thread */
+ btr_cur_t* cursor) /*!< in: tree search cursor */
{
rtr_info_t* rtr_info;
- index = index ? index : cursor->index();
+ dict_index_t* index = cursor->index();
ut_ad(index);
rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info)));
@@ -1316,6 +1326,7 @@ rtr_create_rtr_info(
rtr_info->allocated = true;
rtr_info->cursor = cursor;
rtr_info->index = index;
+ rtr_info->thr = thr;
if (init_matches) {
rtr_info->matches = static_cast<matched_rec_t*>(
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index dfe034ec..89d41bfd 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -73,7 +73,6 @@ extern my_bool opt_readonly;
#include "buf0lru.h"
#include "dict0boot.h"
#include "dict0load.h"
-#include "btr0defragment.h"
#include "dict0crea.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@@ -83,7 +82,6 @@ extern my_bool opt_readonly;
#include "fts0plugin.h"
#include "fts0priv.h"
#include "fts0types.h"
-#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "log0crypt.h"
#include "mtr0mtr.h"
@@ -219,6 +217,8 @@ enum default_row_format_enum {
DEFAULT_ROW_FORMAT_DYNAMIC = 2,
};
+static my_bool innodb_truncate_temporary_tablespace_now;
+
/** Whether ROW_FORMAT=COMPRESSED tables are read-only */
static my_bool innodb_read_only_compressed;
@@ -353,7 +353,7 @@ static TYPELIB innodb_default_row_format_typelib = {
};
/** Names of allowed values of innodb_flush_method */
-const char* innodb_flush_method_names[] = {
+static const char* innodb_flush_method_names[] = {
"fsync",
"O_DSYNC",
"littlesync",
@@ -368,6 +368,8 @@ const char* innodb_flush_method_names[] = {
NullS
};
+static constexpr ulong innodb_flush_method_default = IF_WIN(6,4);
+
/** Enumeration of innodb_flush_method */
TYPELIB innodb_flush_method_typelib = {
array_elements(innodb_flush_method_names) - 1,
@@ -376,6 +378,21 @@ TYPELIB innodb_flush_method_typelib = {
NULL
};
+/** Deprecated parameter */
+static ulong innodb_flush_method;
+
+/** Names of allowed values of innodb_doublewrite */
+static const char *innodb_doublewrite_names[]=
+ {"OFF", "ON", "fast", nullptr};
+
+/** Enumeration of innodb_doublewrite */
+TYPELIB innodb_doublewrite_typelib= {
+ array_elements(innodb_doublewrite_names) - 1,
+ "innodb_doublewrite_typelib",
+ innodb_doublewrite_names,
+ nullptr
+};
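+/* The new "fast" value presumably corresponds to buf_dblwr.USE_FAST;
+innodb_init_params() below also downgrades the doublewrite mode to
+USE_FAST when innodb_flush_method=O_DIRECT_NO_FSYNC is set. */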
+
/** Names of allowed values of innodb_deadlock_report */
static const char *innodb_deadlock_report_names[]= {
"off", /* Do not report any details of deadlocks */
@@ -396,25 +413,6 @@ static TYPELIB innodb_deadlock_report_typelib = {
NULL
};
-/** Allowed values of innodb_change_buffering */
-static const char* innodb_change_buffering_names[] = {
- "none", /* IBUF_USE_NONE */
- "inserts", /* IBUF_USE_INSERT */
- "deletes", /* IBUF_USE_DELETE_MARK */
- "changes", /* IBUF_USE_INSERT_DELETE_MARK */
- "purges", /* IBUF_USE_DELETE */
- "all", /* IBUF_USE_ALL */
- NullS
-};
-
-/** Enumeration of innodb_change_buffering */
-static TYPELIB innodb_change_buffering_typelib = {
- array_elements(innodb_change_buffering_names) - 1,
- "innodb_change_buffering_typelib",
- innodb_change_buffering_names,
- NULL
-};
-
/** Allowed values of innodb_instant_alter_column_allowed */
const char* innodb_instant_alter_column_allowed_names[] = {
"never", /* compatible with MariaDB 5.5 to 10.2 */
@@ -528,9 +526,6 @@ mysql_pfs_key_t fts_cache_mutex_key;
mysql_pfs_key_t fts_cache_init_mutex_key;
mysql_pfs_key_t fts_delete_mutex_key;
mysql_pfs_key_t fts_doc_id_mutex_key;
-mysql_pfs_key_t ibuf_bitmap_mutex_key;
-mysql_pfs_key_t ibuf_mutex_key;
-mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
mysql_pfs_key_t recalc_pool_mutex_key;
mysql_pfs_key_t purge_sys_pq_mutex_key;
mysql_pfs_key_t recv_sys_mutex_key;
@@ -563,8 +558,6 @@ static PSI_mutex_info all_innodb_mutexes[] = {
PSI_KEY(fts_cache_init_mutex),
PSI_KEY(fts_delete_mutex),
PSI_KEY(fts_doc_id_mutex),
- PSI_KEY(ibuf_mutex),
- PSI_KEY(ibuf_pessimistic_insert_mutex),
PSI_KEY(index_online_log),
PSI_KEY(page_zip_stat_per_index_mutex),
PSI_KEY(purge_sys_pq_mutex),
@@ -971,20 +964,6 @@ static SHOW_VAR innodb_status_variables[]= {
{"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T},
{"deadlocks", &lock_sys.deadlocks, SHOW_SIZE_T},
{"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T},
- {"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK],
- SHOW_SIZE_T},
- {"ibuf_discarded_deletes", &ibuf.n_discarded_ops[IBUF_OP_DELETE],
- SHOW_SIZE_T},
- {"ibuf_discarded_inserts", &ibuf.n_discarded_ops[IBUF_OP_INSERT],
- SHOW_SIZE_T},
- {"ibuf_free_list", &ibuf.free_list_len, SHOW_SIZE_T},
- {"ibuf_merged_delete_marks", &ibuf.n_merged_ops[IBUF_OP_DELETE_MARK],
- SHOW_SIZE_T},
- {"ibuf_merged_deletes", &ibuf.n_merged_ops[IBUF_OP_DELETE], SHOW_SIZE_T},
- {"ibuf_merged_inserts", &ibuf.n_merged_ops[IBUF_OP_INSERT], SHOW_SIZE_T},
- {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T},
- {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T},
- {"ibuf_size", &ibuf.size, SHOW_SIZE_T},
{"log_waits", &log_sys.waits, SHOW_SIZE_T},
{"log_write_requests", &log_sys.write_to_buf, SHOW_SIZE_T},
{"log_writes", &log_sys.write_to_log, SHOW_SIZE_T},
@@ -1037,12 +1016,6 @@ static SHOW_VAR innodb_status_variables[]= {
{"have_snappy", &(provider_service_snappy->is_loaded), SHOW_BOOL},
{"have_punch_hole", &innodb_have_punch_hole, SHOW_BOOL},
- /* Defragmentation */
- {"defragment_compression_failures",
- &export_vars.innodb_defragment_compression_failures, SHOW_SIZE_T},
- {"defragment_failures", &export_vars.innodb_defragment_failures,SHOW_SIZE_T},
- {"defragment_count", &export_vars.innodb_defragment_count, SHOW_SIZE_T},
-
{"instant_alter_column",
&export_vars.innodb_instant_alter_column, SHOW_ULONG},
@@ -1525,7 +1498,6 @@ static void innodb_drop_database(handlerton*, char *path)
dfield_set_data(&dfield, namebuf, len);
dict_index_copy_types(&tuple, sys_index, 1);
std::vector<pfs_os_file_t> to_close;
- std::vector<uint32_t> space_ids;
mtr_t mtr;
mtr.start();
pcur.btr_cur.page_cur.index = sys_index;
@@ -1569,7 +1541,6 @@ static void innodb_drop_database(handlerton*, char *path)
ut_ad("corrupted SYS_TABLES.SPACE" == 0);
else if (uint32_t space_id= mach_read_from_4(s))
{
- space_ids.emplace_back(space_id);
pfs_os_file_t detached= fil_delete_tablespace(space_id);
if (detached != OS_FILE_CLOSED)
to_close.emplace_back(detached);
@@ -1579,8 +1550,6 @@ static void innodb_drop_database(handlerton*, char *path)
mtr.commit();
for (pfs_os_file_t detached : to_close)
os_file_close(detached);
- for (const auto id : space_ids)
- ibuf_delete_for_discarded_space(id);
/* Any changes must be persisted before we return. */
if (mtr.commit_lsn())
@@ -3933,8 +3902,6 @@ static int innodb_init_params()
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
- DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL);
-
/* Check that interdependent parameters have sane values. */
if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
@@ -4011,27 +3978,31 @@ static int innodb_init_params()
fts_sort_pll_degree = num_pll_degree;
- /* Store the default charset-collation number of this MySQL
- installation */
-
- data_mysql_default_charset_coll = (ulint) default_charset_info->number;
-
-#if !defined _WIN32 && defined O_DIRECT
- if (srv_use_atomic_writes && my_may_have_atomic_write) {
- /*
- Force O_DIRECT on Unixes (on Windows writes are always
- unbuffered)
- */
- switch (srv_file_flush_method) {
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
- break;
- default:
- srv_file_flush_method = SRV_O_DIRECT;
- fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
- }
- }
+ if (innodb_flush_method == 1 /* O_DSYNC */) {
+ log_sys.log_write_through = true;
+ fil_system.write_through = true;
+ fil_system.buffered = false;
+#if defined __linux__ || defined _WIN32
+ log_sys.log_buffered = false;
+ goto skip_buffering_tweak;
+#endif
+ } else if (innodb_flush_method >= 4 /* O_DIRECT */
+ IF_WIN(&& innodb_flush_method < 8 /* normal */,)) {
+ /* O_DIRECT and similar settings do nothing */
+ if (innodb_flush_method == 5 /* O_DIRECT_NO_FSYNC */
+ && buf_dblwr.use) {
+ buf_dblwr.use = buf_dblwr.USE_FAST;
+ }
+#ifdef O_DIRECT
+ } else if (srv_use_atomic_writes && my_may_have_atomic_write) {
+ /* If atomic writes are enabled, do the same as with
+ innodb_flush_method=O_DIRECT: retain the default settings */
#endif
+ } else {
+ log_sys.log_write_through = false;
+ fil_system.write_through = false;
+ fil_system.buffered = true;
+ }
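+	/* In summary, the deprecated innodb_flush_method now only chooses
+	between write-through I/O (O_DSYNC), the defaults (the O_DIRECT
+	family and atomic-write capable files), and buffered I/O (fsync
+	and the remaining legacy values). */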
#if defined __linux__ || defined _WIN32
if (srv_flush_log_at_trx_commit == 2) {
@@ -4039,14 +4010,9 @@ static int innodb_init_params()
innodb_flush_log_at_trx_commit=2. */
log_sys.log_buffered = true;
}
+skip_buffering_tweak:
#endif
-#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
- /* Currently native AIO is supported only on windows and linux
- and that also when the support is compiled in. In all other
- cases, we ignore the setting of innodb_use_native_aio. */
- srv_use_native_aio = FALSE;
-#endif
#ifdef HAVE_URING
if (srv_use_native_aio && io_uring_may_be_unsafe) {
sql_print_warning("innodb_use_native_aio may cause "
@@ -4054,28 +4020,39 @@ static int innodb_init_params()
"https://jira.mariadb.org/browse/MDEV-26674",
io_uring_may_be_unsafe);
}
+#elif !defined LINUX_NATIVE_AIO && !defined _WIN32
+ /* Currently native AIO is supported only on windows and linux
+ and that also when the support is compiled in. In all other
+ cases, we ignore the setting of innodb_use_native_aio. */
+ srv_use_native_aio = FALSE;
#endif
-#ifdef _WIN32
- switch (srv_file_flush_method) {
- case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
- srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
- break;
- case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
- srv_file_flush_method = SRV_FSYNC;
- break;
- default:
- ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
- }
-#else
- ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
-#endif
innodb_buffer_pool_size_init();
srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
DBUG_RETURN(0);
}
+
+/*********************************************************************//**
+Set up cost factors for InnoDB so that the optimizer can approximate
+how many milliseconds different operations take. See the cost
+functions in handler.h for how the different variables are used */
+
+static void innobase_update_optimizer_costs(OPTIMIZER_COSTS *costs)
+{
+ /*
+    The following numbers were found by check_costs.pl using 1M rows
+    with all rows cached. See optimizer_costs.txt for details.
+ */
+ costs->row_next_find_cost= 0.00007013;
+ costs->row_lookup_cost= 0.00076597;
+ costs->key_next_find_cost= 0.00009900;
+ costs->key_lookup_cost= 0.00079112;
+ costs->row_copy_cost= 0.00006087;
+}
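+
+/* These constants override the generic optimizer cost defaults for
+InnoDB tables; the hook is registered in innodb_init() via
+innobase_hton->update_optimizer_costs. */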
+
+
/** Initialize the InnoDB storage engine plugin.
@param[in,out] p InnoDB handlerton
@return error code
@@ -4143,6 +4120,8 @@ static int innodb_init(void* p)
innobase_hton->prepare_commit_versioned
= innodb_prepare_commit_versioned;
+ innobase_hton->update_optimizer_costs= innobase_update_optimizer_costs;
+
innodb_remember_check_sysvar_funcs();
compile_time_assert(DATA_MYSQL_TRUE_VARCHAR == MYSQL_TYPE_VARCHAR);
@@ -4223,8 +4202,6 @@ static int innodb_init(void* p)
innobase_old_blocks_pct = buf_LRU_old_ratio_update(
innobase_old_blocks_pct, true);
- ibuf_max_size_update(srv_change_buffer_max_size);
-
mysql_mutex_init(pending_checkpoint_mutex_key,
&log_requests.mutex,
MY_MUTEX_INIT_FAST);
@@ -4286,6 +4263,11 @@ innobase_end(handlerton*, ha_panic_function)
}
}
+ /* Do system tablespace truncation during slow shutdown */
+ if (!srv_fast_shutdown && !high_level_read_only
+ && srv_operation == SRV_OPERATION_NORMAL) {
+ fsp_system_tablespace_truncate();
+ }
innodb_shutdown();
mysql_mutex_destroy(&log_requests.mutex);
@@ -5102,13 +5084,11 @@ ha_innobase::index_flags(
}
ulong flags= key == table_share->primary_key
- ? HA_CLUSTERED_INDEX : 0;
+ ? HA_CLUSTERED_INDEX : HA_KEYREAD_ONLY | HA_DO_RANGE_FILTER_PUSHDOWN;
flags |= HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
- | HA_READ_RANGE | HA_KEYREAD_ONLY
- | HA_DO_INDEX_COND_PUSHDOWN
- | HA_DO_RANGE_FILTER_PUSHDOWN;
-
+ | HA_READ_RANGE
+ | HA_DO_INDEX_COND_PUSHDOWN;
return(flags);
}
@@ -5172,13 +5152,21 @@ ha_innobase::keys_to_use_for_scanning()
return(&key_map_full);
}
-/** Ensure that indexed virtual columns will be computed. */
+/****************************************************************//**
+Ensure that indexed virtual columns will be computed.
+This needs to be done for indexes that are being added by an inplace
+ALTER TABLE in another thread, because from the server's point of
+view these columns are not yet indexed.
+*/
void ha_innobase::column_bitmaps_signal()
{
if (!table->vfield || table->current_lock != F_WRLCK)
return;
dict_index_t* clust_index= dict_table_get_first_index(m_prebuilt->table);
+ if (!clust_index->online_log)
+ return;
+
uint num_v= 0;
for (uint j = 0; j < table->s->virtual_fields; j++)
{
@@ -5289,7 +5277,8 @@ create_table_info_t::create_table_info_t(
m_create_info(create_info),
m_table_name(table_name), m_table(NULL),
m_remote_path(remote_path),
- m_innodb_file_per_table(file_per_table)
+ m_innodb_file_per_table(file_per_table),
+ m_creating_stub(thd_ddl_options(thd)->import_tablespace())
{
}
@@ -5841,6 +5830,7 @@ ha_innobase::open(const char* name, int, uint)
/* Will be allocated if it is needed in ::update_row() */
m_upd_buf = NULL;
m_upd_buf_size = 0;
+ m_disable_rowid_filter = false;
char* is_part = is_partition(norm_name);
THD* thd = ha_thd();
@@ -5849,8 +5839,22 @@ ha_innobase::open(const char* name, int, uint)
DEBUG_SYNC(thd, "ib_open_after_dict_open");
- if (NULL == ib_table) {
-
+ if (UNIV_LIKELY(ib_table != nullptr)) {
+ } else if (thd_ddl_options(thd)->import_tablespace()) {
+ /* If the table does not exist and we are trying to
+ import, create a "stub" table similar to the effects
+ of CREATE TABLE followed by ALTER TABLE ... DISCARD
+ TABLESPACE. */
+
+ HA_CREATE_INFO create_info;
+ if (int err = prepare_create_stub_for_import(thd, norm_name,
+ create_info))
+ DBUG_RETURN(err);
+ create(norm_name, table, &create_info, true, nullptr);
+ DEBUG_SYNC(thd, "ib_after_create_stub_for_import");
+ ib_table = open_dict_table(name, norm_name, is_part,
+ DICT_ERR_IGNORE_FK_NOKEY);
+ } else {
if (is_part) {
sql_print_error("Failed to open table %s.\n",
norm_name);
@@ -6567,8 +6571,7 @@ uint8_t
get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field)
{
/* The following asserts try to check that the MySQL type code fits in
- 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
- the type */
+ 8 bits: this is used when DATA_NOT_NULL is ORed to the type */
static_assert(MYSQL_TYPE_STRING < 256, "compatibility");
static_assert(MYSQL_TYPE_VAR_STRING < 256, "compatibility");
@@ -7314,7 +7317,8 @@ ha_innobase::build_template(
/* Below we check column by column if we need to access
the clustered index. */
- if (pushed_rowid_filter && rowid_filter_is_active) {
+ if (pushed_rowid_filter && rowid_filter_is_active
+ && !m_disable_rowid_filter) {
fetch_primary_key_cols = TRUE;
m_prebuilt->pk_filter = this;
} else {
@@ -7371,7 +7375,8 @@ ha_innobase::build_template(
simplified to handle both. It should handle the issues. */
const bool pushed_down = active_index != MAX_KEY
- && active_index == pushed_idx_cond_keyno;
+ && active_index == pushed_idx_cond_keyno
+ && !m_disable_rowid_filter;
m_prebuilt->idx_cond = pushed_down ? this : nullptr;
@@ -9403,6 +9408,11 @@ ha_innobase::rnd_init(
{
int err;
+	/* Don't use the rowid filter when reading rows by position
+	(rnd_init(scan=false) followed by rnd_pos() calls); the flag is
+	cleared again in rnd_end(). */
+ if (!scan) {
+ m_disable_rowid_filter = true;
+ }
+
/* Store the active index value so that we can restore the original
value after a scan */
@@ -9412,6 +9422,12 @@ ha_innobase::rnd_init(
err = change_active_index(m_primary_key);
}
+ if (err && !scan) {
+ /* Restore the original value in case of error */
+ m_disable_rowid_filter = false;
+ }
+
+
/* Don't use semi-consistent read in random row reads (by position).
This means we must disable semi_consistent_read if scan is false */
@@ -9432,6 +9448,7 @@ int
ha_innobase::rnd_end(void)
/*======================*/
{
+ m_disable_rowid_filter = false;
return(index_end());
}
@@ -9483,6 +9500,7 @@ ha_innobase::rnd_pos(
/* Note that we assume the length of the row reference is fixed
for the table, and it is == ref_length */
+ DBUG_ASSERT(m_disable_rowid_filter == true);
int error = index_read(buf, pos, (uint)ref_length, HA_READ_KEY_EXACT);
if (error != 0) {
@@ -9513,6 +9531,11 @@ ha_innobase::ft_init()
trx->will_lock = true;
}
+ /* If there is an FTS scan in progress, stop it */
+ fts_result_t* result = (reinterpret_cast<NEW_FT_INFO*>(ft_handler))->ft_result;
+ if (result)
+ result->current= NULL;
+
DBUG_RETURN(rnd_init(false));
}
@@ -10629,6 +10652,10 @@ create_table_info_t::create_table_def()
? doc_id_col : n_cols - num_v;
}
+ /* Assume the tablespace is not available until we are able to
+	import it. */
+ table->file_unreadable = m_creating_stub;
+
if (DICT_TF_HAS_DATA_DIR(m_flags)) {
ut_a(strlen(m_remote_path));
@@ -11642,6 +11669,10 @@ index_bad:
}
}
+ /* If we are trying to import a tablespace, mark tablespace as
+ discarded. */
+ m_flags2 |= ulint{m_creating_stub} << DICT_TF2_POS_DISCARDED;
+
row_type = m_create_info->row_type;
if (zip_ssize && zip_allowed) {
@@ -12788,6 +12819,7 @@ int create_table_info_t::create_table(bool create_fk)
dict_table_get_all_fts_indexes(m_table, fts->indexes);
}
+ create_fk&= !m_creating_stub;
dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS;
if (err == DB_SUCCESS) {
@@ -13193,6 +13225,9 @@ ha_innobase::create(const char *name, TABLE *form, HA_CREATE_INFO *create_info,
}
if (!error)
+ /* We can't possibly have foreign key information when creating a
+ stub table for importing .frm / .cfg / .ibd because it is not
+ stored in any of these files. */
error= info.create_table(own_trx);
if (own_trx || (info.flags2() & DICT_TF2_TEMPORARY))
@@ -13215,7 +13250,11 @@ ha_innobase::create(const char *name, TABLE *form, HA_CREATE_INFO *create_info,
if (!error)
{
- dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE);
+ /* Skip stats update when creating a stub table for importing,
+    as it is not needed and would report an error because the table
+    is not readable yet. */
+ if (!info.creating_stub())
+ dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE);
if (!info.table()->is_temporary())
log_write_up_to(trx->commit_lsn, true);
info.table()->release();
@@ -14376,13 +14415,15 @@ ha_innobase::estimate_rows_upper_bound()
DBUG_RETURN((ha_rows) estimate);
}
+
/*********************************************************************//**
How many seeks it will take to read through the table. This is to be
comparable to the number returned by records_in_range so that we can
decide if we should scan the table or use keys.
@return estimated time measured in disk seeks */
-double
+#ifdef NOT_USED
+IO_AND_CPU_COST
ha_innobase::scan_time()
/*====================*/
{
@@ -14402,24 +14443,28 @@ ha_innobase::scan_time()
TODO: This will be further improved to return some approximate
estimate but that would also needs pre-population of stats
structure. As of now approach is in sync with MyISAM. */
- return(ulonglong2double(stats.data_file_length) / IO_SIZE + 2);
+ return { (ulonglong2double(stats.data_file_length) / IO_SIZE * DISK_READ_COST), 0.0 };
}
ulint stat_clustered_index_size;
-
+ IO_AND_CPU_COST cost;
ut_a(m_prebuilt->table->stat_initialized);
stat_clustered_index_size =
m_prebuilt->table->stat_clustered_index_size;
- return((double) stat_clustered_index_size);
+ cost.io= (double) stat_clustered_index_size * DISK_READ_COST;
+ cost.cpu= 0;
+ return(cost);
}
+#endif
/******************************************************************//**
Calculate the time it takes to read a set of ranges through an index
This enables us to optimise reads for clustered indexes.
@return estimated time measured in disk seeks */
+#ifdef NOT_USED
double
ha_innobase::read_time(
/*===================*/
@@ -14444,8 +14489,33 @@ ha_innobase::read_time(
return(time_for_scan);
}
- return(ranges + (double) rows / (double) total_rows * time_for_scan);
+ return(ranges * KEY_LOOKUP_COST + (double) rows / (double) total_rows * time_for_scan);
+}
+
+/******************************************************************//**
+Calculate the time it takes to read a set of rows with primary key.
+*/
+
+IO_AND_CPU_COST
+ha_innobase::rnd_pos_time(ha_rows rows)
+{
+ ha_rows total_rows;
+
+ /* Assume that the read time is proportional to the scan time for all
+ rows + at most one seek per range. */
+
+ IO_AND_CPU_COST time_for_scan = scan_time();
+
+ if ((total_rows = estimate_rows_upper_bound()) < rows) {
+
+ return(time_for_scan);
+ }
+ double frac= (double) rows + (double) rows / (double) total_rows;
+ time_for_scan.io*= frac;
+ time_for_scan.cpu*= frac;
+ return(time_for_scan);
}
+#endif
/*********************************************************************//**
Calculates the key number used inside MySQL for an Innobase index.
@@ -14673,7 +14743,7 @@ ha_innobase::info_low(
DBUG_ASSERT(ib_table->get_ref_count() > 0);
if (!ib_table->is_readable()) {
- dict_stats_empty_table(ib_table, true);
+ dict_stats_empty_table(ib_table);
}
if (flag & HA_STATUS_TIME) {
@@ -14919,13 +14989,6 @@ ha_innobase::info_low(
innodb_rec_per_key(index, j,
stats.records));
- /* Since MySQL seems to favor table scans
- too much over index searches, we pretend
- index selectivity is 2 times better than
- our estimate: */
-
- rec_per_key_int = rec_per_key_int / 2;
-
if (rec_per_key_int == 0) {
rec_per_key_int = 1;
}
@@ -15003,58 +15066,6 @@ ha_innobase::analyze(THD*, HA_CHECK_OPT*)
return(HA_ADMIN_OK);
}
-/*****************************************************************//**
-Defragment table.
-@return error number */
-inline int ha_innobase::defragment_table()
-{
- for (dict_index_t *index= dict_table_get_first_index(m_prebuilt->table);
- index; index= dict_table_get_next_index(index))
- {
- if (!index->is_btree())
- continue;
-
- if (btr_defragment_find_index(index))
- {
- // We borrow this error code. When the same index is already in
- // the defragmentation queue, issuing another defragmentation
- // only introduces overhead. We return an error here to let the
- // user know this is not necessary. Note that this will fail a
- // query that's trying to defragment a full table if one of the
- // indicies in that table is already in defragmentation. We
- // choose this behavior so user is aware of this rather than
- // silently defragment other indicies of that table.
- return ER_SP_ALREADY_EXISTS;
- }
-
- btr_pcur_t pcur;
-
- mtr_t mtr;
- mtr.start();
- if (dberr_t err= pcur.open_leaf(true, index, BTR_SEARCH_LEAF, &mtr))
- {
- mtr.commit();
- return convert_error_code_to_mysql(err, 0, m_user_thd);
- }
- else if (btr_pcur_get_block(&pcur)->page.id().page_no() == index->page)
- {
- mtr.commit();
- continue;
- }
-
- btr_pcur_move_to_next(&pcur, &mtr);
- btr_pcur_store_position(&pcur, &mtr);
- mtr.commit();
- ut_ad(pcur.index() == index);
- const bool interrupted= btr_defragment_add_index(&pcur, m_user_thd);
- ut_free(pcur.old_rec_buf);
- if (interrupted)
- return ER_QUERY_INTERRUPTED;
- }
-
- return 0;
-}
-
/**********************************************************************//**
This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
the table in MySQL. */
@@ -15076,25 +15087,6 @@ ha_innobase::optimize(
calls to OPTIMIZE, which is undesirable. */
bool try_alter = true;
- if (!m_prebuilt->table->is_temporary()
- && m_prebuilt->table->is_readable()
- && srv_defragment) {
- int err = defragment_table();
-
- if (err == 0) {
- try_alter = false;
- } else {
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- uint(err),
- "InnoDB: Cannot defragment table %s: returned error code %d\n",
- m_prebuilt->table->name.m_name, err);
-
- if(err == ER_SP_ALREADY_EXISTS) {
- try_alter = false;
- }
- }
- }
-
if (innodb_optimize_fulltext_only) {
if (m_prebuilt->table->fts && m_prebuilt->table->fts->cache
&& m_prebuilt->table->space) {
@@ -15432,7 +15424,7 @@ static
FOREIGN_KEY_INFO*
get_foreign_key_info(
/*=================*/
- THD* thd, /*!< in: user thread handle */
+ const THD* thd, /*!< in: user thread handle */
dict_foreign_t* foreign)/*!< in: foreign key constraint */
{
FOREIGN_KEY_INFO f_key_info;
@@ -15565,7 +15557,7 @@ Gets the list of foreign keys in this table.
int
ha_innobase::get_foreign_key_list(
/*==============================*/
- THD* thd, /*!< in: user thread handle */
+ const THD* thd, /*!< in: user thread handle */
List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
{
update_thd(ha_thd());
@@ -15603,7 +15595,7 @@ Gets the set of foreign keys where this table is the referenced table.
int
ha_innobase::get_parent_foreign_key_list(
/*=====================================*/
- THD* thd, /*!< in: user thread handle */
+ const THD* thd, /*!< in: user thread handle */
List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
{
update_thd(ha_thd());
@@ -16505,7 +16497,10 @@ ha_innobase::store_lock(
|| sql_command == SQLCOM_REPLACE_SELECT
|| sql_command == SQLCOM_UPDATE
|| sql_command == SQLCOM_CREATE_SEQUENCE
- || sql_command == SQLCOM_CREATE_TABLE))) {
+ || sql_command == SQLCOM_CREATE_TABLE))
+ || (trx->isolation_level == TRX_ISO_REPEATABLE_READ
+ && sql_command == SQLCOM_ALTER_TABLE
+ && lock_type == TL_READ)) {
/* If the transaction isolation level is
READ UNCOMMITTED or READ COMMITTED and we are executing
@@ -17547,20 +17542,6 @@ innodb_old_blocks_pct_update(THD*, st_mysql_sys_var*, void*, const void* save)
innobase_old_blocks_pct = ratio;
}
-/****************************************************************//**
-Update the system variable innodb_old_blocks_pct using the "saved"
-value. This function is registered as a callback with MySQL. */
-static
-void
-innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*,
- const void* save)
-{
- srv_change_buffer_max_size = *static_cast<const uint*>(save);
- mysql_mutex_unlock(&LOCK_global_system_variables);
- ibuf_max_size_update(srv_change_buffer_max_size);
- mysql_mutex_lock(&LOCK_global_system_variables);
-}
-
#ifdef UNIV_DEBUG
static uint srv_fil_make_page_dirty_debug = 0;
static uint srv_saved_page_number_debug;
@@ -18100,15 +18081,6 @@ innodb_reset_all_monitor_update(
innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE);
}
-static
-void
-innodb_defragment_frequency_update(THD*, st_mysql_sys_var*, void*,
- const void* save)
-{
- srv_defragment_frequency = (*static_cast<const uint*>(save));
- srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
-}
-
static inline char *my_strtok_r(char *str, const char *delim, char **saveptr)
{
#if defined _WIN32
@@ -18444,7 +18416,7 @@ buffer_pool_load_abort(
}
#if defined __linux__ || defined _WIN32
-static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
+static void innodb_log_file_buffering_update(THD *, st_mysql_sys_var*,
void *, const void *save)
{
mysql_mutex_unlock(&LOCK_global_system_variables);
@@ -18453,6 +18425,36 @@ static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
}
#endif
+static void innodb_log_file_write_through_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ log_sys.set_write_through(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static void innodb_data_file_buffering_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_system.set_buffered(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static void innodb_data_file_write_through_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fil_system.set_write_through(*static_cast<const my_bool*>(save));
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static void innodb_doublewrite_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ fil_system.set_use_doublewrite(*static_cast<const ulong*>(save));
+}
+
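The three new write-through/buffering callbacks above all release LOCK_global_system_variables around the call into the log or file subsystem, which may reopen files or perform synchronization, and reacquire it before returning; innodb_doublewrite_update skips the unlock, presumably because it only stores a value. A generic, hypothetical sketch of that unlock-work-relock pattern (the mutex and setter below stand in for the real ones):

// Hypothetical sketch of the "drop the global sysvar mutex around slow work"
// pattern used by the update callbacks above.
#include <mutex>

static std::mutex global_sysvar_mutex;   // stands in for LOCK_global_system_variables

static void apply_setting(bool value)    // stands in for e.g. fil_system.set_buffered()
{
  (void) value;                          // possibly slow: may reopen files, fsync, etc.
}

static void sysvar_update(bool new_value)
{
  // The server invokes update callbacks with the mutex held; unlocking here
  // keeps other SET GLOBAL statements from stalling behind file operations.
  global_sysvar_mutex.unlock();
  apply_setting(new_value);
  global_sysvar_mutex.lock();
}

int main()
{
  global_sysvar_mutex.lock();            // the SET GLOBAL code path holds the mutex
  sysvar_update(true);
  global_sysvar_mutex.unlock();
  return 0;
}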
static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
void *var, const void *save)
{
@@ -18577,6 +18579,21 @@ innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
mysql_mutex_lock(&LOCK_global_system_variables);
}
+/** Truncate the temporary tablespace if
+innodb_truncate_temporary_tablespace_now is enabled.
+@param save to-be-assigned value */
+static
+void
+innodb_trunc_temp_space_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ /* Temp tablespace is not initialized in read only mode. */
+ if (!*static_cast<const my_bool*>(save) || srv_read_only_mode)
+ return;
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ fsp_shrink_temp_space();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
static SHOW_VAR innodb_status_variables_export[]= {
SHOW_FUNC_ENTRY("Innodb", &show_innodb_vars),
{NullS, NullS, SHOW_LONG}
@@ -18786,11 +18803,14 @@ static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
"The common part for InnoDB table spaces.",
NULL, NULL, NULL);
-static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf,
- PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Enable InnoDB doublewrite buffer (enabled by default)."
- " Disable with --skip-innodb-doublewrite.",
- NULL, NULL, TRUE);
+static MYSQL_SYSVAR_ENUM(doublewrite, buf_dblwr.use,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether and how to use the doublewrite buffer. "
+ "OFF=Assume that writes of innodb_page_size are atomic; "
+ "ON=Prevent torn writes (the default); "
+ "fast=Like ON, but do not synchronize writes to data files",
+ nullptr, innodb_doublewrite_update, true,
+ &innodb_doublewrite_typelib);
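innodb_doublewrite changes here from a read-only boolean into a dynamically settable three-valued ENUM whose update callback forwards the chosen value to fil_system.set_use_doublewrite(). A hypothetical sketch of the mode encoding implied by the help text follows; ENUM system variables store the position in their typelib, so OFF=0, ON=1, fast=2 is the natural encoding, and the names below are illustrative rather than the server's.

// Illustrative mapping of the three documented doublewrite modes.
#include <cassert>

enum class doublewrite_mode : unsigned long { OFF = 0, ON = 1, FAST = 2 };

// OFF disables the doublewrite buffer entirely; ON and fast both use it,
// with fast additionally skipping the synchronization of data-file writes
// that ON performs (per the option's help text above).
static bool use_doublewrite(doublewrite_mode m)
{
  return m != doublewrite_mode::OFF;
}

int main()
{
  assert(!use_doublewrite(doublewrite_mode::OFF));
  assert(use_doublewrite(doublewrite_mode::ON));
  assert(use_doublewrite(doublewrite_mode::FAST));
  return 0;
}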
static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
@@ -18875,7 +18895,7 @@ static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown,
fast_shutdown_validate, NULL, 1, 0, 3, 0);
static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
- PLUGIN_VAR_NOCMDARG,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_DEPRECATED,
"Stores each InnoDB table to an .ibd file in the database dir.",
NULL, NULL, TRUE);
@@ -18905,11 +18925,10 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
" guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
NULL, NULL, 1, 0, 3, 0);
-static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED,
"With which method to flush data.",
- NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT),
- &innodb_flush_method_typelib);
+ NULL, NULL, innodb_flush_method_default, &innodb_flush_method_typelib);
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -19105,60 +19124,6 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, TRUE);
-static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
- PLUGIN_VAR_RQCMDARG,
- "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
- "defragmentation will be paused. And new defragmentation command will fail."
- "Paused defragmentation commands will resume when this variable is set to "
- "true again.",
- NULL, NULL, FALSE);
-
-static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
- PLUGIN_VAR_RQCMDARG,
- "Number of pages considered at once when merging multiple pages to "
- "defragment",
- NULL, NULL, 7, 2, 32, 0);
-
-static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
- srv_defragment_stats_accuracy,
- PLUGIN_VAR_RQCMDARG,
- "How many defragment stats changes there are before the stats "
- "are written to persistent storage. Set to 0 meaning disable "
- "defragment stats tracking.",
- NULL, NULL, 0, 0, ~0U, 0);
-
-static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
- srv_defragment_fill_factor_n_recs,
- PLUGIN_VAR_RQCMDARG,
- "How many records of space defragmentation should leave on the page. "
- "This variable, together with innodb_defragment_fill_factor, is introduced "
- "so defragmentation won't pack the page too full and cause page split on "
- "the next insert on every page. The variable indicating more defragmentation"
- " gain is the one effective.",
- NULL, NULL, 20, 1, 100, 0);
-
-static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
- PLUGIN_VAR_RQCMDARG,
- "A number between [0.7, 1] that tells defragmentation how full it should "
- "fill a page. Default is 0.9. Number below 0.7 won't make much sense."
- "This variable, together with innodb_defragment_fill_factor_n_recs, is "
- "introduced so defragmentation won't pack the page too full and cause "
- "page split on the next insert on every page. The variable indicating more "
- "defragmentation gain is the one effective.",
- NULL, NULL, 0.9, 0.7, 1, 0);
-
-static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
- PLUGIN_VAR_RQCMDARG,
- "Do not defragment a single index more than this number of time per second."
- "This controls the number of time defragmentation thread can request X_LOCK "
- "on an index. Defragmentation thread will check whether "
- "1/defragment_frequency (s) has passed since it worked on this index last "
- "time, and put the index back to the queue if not enough time has passed. "
- "The actual frequency can only be lower than this given number.",
- NULL, innodb_defragment_frequency_update,
- SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
-
-
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@@ -19341,6 +19306,21 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
nullptr, innodb_log_file_buffering_update, FALSE);
#endif
+static MYSQL_SYSVAR_BOOL(log_file_write_through, log_sys.log_write_through,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether each write to ib_logfile0 is write through",
+ nullptr, innodb_log_file_write_through_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(data_file_buffering, fil_system.buffered,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether the file system cache for data files is enabled",
+ nullptr, innodb_data_file_buffering_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether each write to data files writes through",
+ nullptr, innodb_data_file_write_through_update, FALSE);
+
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG,
"Redo log size in bytes.",
@@ -19407,7 +19387,7 @@ static MYSQL_SYSVAR_UINT(undo_tablespaces, srv_undo_tablespaces,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of undo tablespaces to use.",
NULL, NULL,
- 0L, /* Default seting */
+ 3L, /* Default setting */
0L, /* Minimum value */
TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */
@@ -19488,31 +19468,6 @@ static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave,
NULL, NULL, FALSE);
#endif /* HAVE_LIBNUMA */
-static void innodb_change_buffering_update(THD *thd, struct st_mysql_sys_var*,
- void*, const void *save)
-{
- ulong i= *static_cast<const ulong*>(save);
- if (i != IBUF_USE_NONE && !ibuf.index)
- push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE,
- "InnoDB: The change buffer is corrupted.");
- else
- innodb_change_buffering= i;
-}
-
-static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_DEPRECATED,
- "Buffer changes to secondary indexes.",
- nullptr, innodb_change_buffering_update,
- IBUF_USE_NONE, &innodb_change_buffering_typelib);
-
-static MYSQL_SYSVAR_UINT(change_buffer_max_size,
- srv_change_buffer_max_size,
- PLUGIN_VAR_RQCMDARG,
- "Maximum on-disk size of change buffer in terms of percentage"
- " of the buffer pool.",
- NULL, innodb_change_buffer_max_size_update,
- CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
-
static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
PLUGIN_VAR_RQCMDARG,
"Specifies how InnoDB index statistics collection code should"
@@ -19520,18 +19475,6 @@ static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
" NULLS_UNEQUAL and NULLS_IGNORED",
NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-static MYSQL_SYSVAR_BOOL(change_buffer_dump, ibuf_dump,
- PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Dump the change buffer at startup.",
- NULL, NULL, FALSE);
-
-static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
- PLUGIN_VAR_RQCMDARG,
- "Debug flags for InnoDB change buffering (0=none, 1=try to buffer)",
- NULL, NULL, 0, 0, 1, 0);
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency,
PLUGIN_VAR_RQCMDARG,
"A number between [0, 100] that tells how oftern buffer pool dump status "
@@ -19752,6 +19695,12 @@ static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tabl
"Enrypt the temporary table data.",
NULL, NULL, false);
+static MYSQL_SYSVAR_BOOL(truncate_temporary_tablespace_now,
+ innodb_truncate_temporary_tablespace_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Shrink the temporary tablespace",
+ NULL, innodb_trunc_temp_space_update, false);
+
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(autoextend_increment),
MYSQL_SYSVAR(buffer_pool_size),
@@ -19769,12 +19718,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_pages_abort),
#endif /* UNIV_DEBUG */
MYSQL_SYSVAR(buffer_pool_load_at_startup),
- MYSQL_SYSVAR(defragment),
- MYSQL_SYSVAR(defragment_n_pages),
- MYSQL_SYSVAR(defragment_stats_accuracy),
- MYSQL_SYSVAR(defragment_fill_factor),
- MYSQL_SYSVAR(defragment_fill_factor_n_recs),
- MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(lru_flush_size),
MYSQL_SYSVAR(flush_neighbors),
@@ -19811,6 +19754,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering),
#endif
+ MYSQL_SYSVAR(log_file_write_through),
+ MYSQL_SYSVAR(data_file_buffering),
+ MYSQL_SYSVAR(data_file_write_through),
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_spin_wait_delay),
MYSQL_SYSVAR(log_group_home_dir),
@@ -19860,12 +19806,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#ifdef HAVE_LIBNUMA
MYSQL_SYSVAR(numa_interleave),
#endif /* HAVE_LIBNUMA */
- MYSQL_SYSVAR(change_buffering),
- MYSQL_SYSVAR(change_buffer_max_size),
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
- MYSQL_SYSVAR(change_buffer_dump),
- MYSQL_SYSVAR(change_buffering_debug),
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
MYSQL_SYSVAR(random_read_ahead),
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(read_only),
@@ -19921,6 +19861,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buf_dump_status_frequency),
MYSQL_SYSVAR(background_thread),
MYSQL_SYSVAR(encrypt_temporary_tables),
+ MYSQL_SYSVAR(truncate_temporary_tablespace_now),
NULL
};
@@ -20021,6 +19962,7 @@ ha_innobase::multi_range_read_info_const(
uint n_ranges,
uint* bufsz,
uint* flags,
+ ha_rows limit,
Cost_estimate* cost)
{
/* See comments in ha_myisam::multi_range_read_info_const */
@@ -20030,8 +19972,9 @@ ha_innobase::multi_range_read_info_const(
*flags |= HA_MRR_USE_DEFAULT_IMPL;
}
- ha_rows res= m_ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges,
- bufsz, flags, cost);
+ ha_rows res= m_ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param,
+ n_ranges,
+ bufsz, flags, limit, cost);
return res;
}
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 50ac423f..86ece0df 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -105,10 +105,10 @@ public:
int close(void) override;
- double scan_time() override;
-
- double read_time(uint index, uint ranges, ha_rows rows) override;
-
+#ifdef NOT_USED
+ IO_AND_CPU_COST scan_time() override;
+ double rnd_pos_time(ha_rows rows) override;
+#endif
int write_row(const uchar * buf) override;
int update_row(const uchar * old_data, const uchar * new_data) override;
@@ -207,7 +207,6 @@ public:
int delete_table(const char *name) override;
int rename_table(const char* from, const char* to) override;
- inline int defragment_table();
int check(THD* thd, HA_CHECK_OPT* check_opt) override;
int check_for_upgrade(HA_CHECK_OPT* check_opt) override;
@@ -215,11 +214,11 @@ public:
char* get_foreign_key_create_info() override;
- int get_foreign_key_list(THD *thd,
+ int get_foreign_key_list(const THD *thd,
List<FOREIGN_KEY_INFO> *f_key_list) override;
int get_parent_foreign_key_list(
- THD* thd,
+ const THD* thd,
List<FOREIGN_KEY_INFO>* f_key_list) override;
bool can_switch_engines() override;
@@ -384,6 +383,7 @@ public:
uint n_ranges,
uint* bufsz,
uint* flags,
+ ha_rows limit,
Cost_estimate* cost) override;
/** Initialize multi range read and get information.
@@ -522,6 +522,10 @@ protected:
/** If mysql has locked with external_lock() */
bool m_mysql_has_locked;
+
+ /** If true, disable the Rowid Filter. It is disabled when
+ the enigne is intialized for making rnd_pos() calls */
+ bool m_disable_rowid_filter;
};
@@ -702,6 +706,8 @@ public:
ulint flags2() const
{ return(m_flags2); }
+ bool creating_stub() const { return UNIV_UNLIKELY(m_creating_stub); }
+
/** Get trx. */
trx_t* trx() const
{ return(m_trx); }
@@ -768,6 +774,9 @@ private:
/** Table flags2 */
ulint m_flags2;
+
+ /** Whether we are creating a stub table for importing. */
+ const bool m_creating_stub;
};
/**
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 6689b9ef..560840c5 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -2150,7 +2150,7 @@ next_page:
}
next_page= false;
- block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, false, &mtr);
+ block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, &mtr);
if (!block)
goto non_empty;
page_cur_set_before_first(block, cur);
@@ -10319,7 +10319,16 @@ commit_try_rebuild(
"parent" table. */
if (!user_table->space) {
rebuilt_table->file_unreadable = true;
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic push
+# if __GNUC__ < 12 || defined WITH_UBSAN
+# pragma GCC diagnostic ignored "-Wconversion"
+# endif
+#endif
rebuilt_table->flags2 |= DICT_TF2_DISCARDED;
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic pop
+#endif
}
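The pragmas above suppress -Wconversion for GCC versions older than 12 (or UBSan builds) around the flags2 update; the warning arises because a compound bit-or on a narrow unsigned field goes through int before being stored back. A minimal standalone reproduction of that pattern, assuming flags2 is a bit-field and using a hypothetical flag value:

// Minimal reproduction of the -Wconversion pattern the pragmas above guard
// against; the bit-field width and the flag value are hypothetical.
struct table_flags { unsigned flags2 : 26; };

constexpr unsigned DISCARDED = 1U << 5;

void mark_discarded(table_flags &t)
{
  // t.flags2 |= DISCARDED expands to an int-typed expression being stored
  // back into a 26-bit field, which some GCC versions flag with -Wconversion.
  t.flags2 |= DISCARDED;
}

int main()
{
  table_flags t{};
  mark_discarded(t);
  return t.flags2 == DISCARDED ? 0 : 1;
}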
/* We can now rename the old table as a temporary table,
@@ -10346,6 +10355,7 @@ commit_try_rebuild(
/* We must be still holding a table handle. */
DBUG_ASSERT(user_table->get_ref_count() == 1);
+ rebuilt_table->row_id = uint64_t{user_table->row_id};
DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;);
switch (error) {
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index 711144e3..ec50418b 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -39,7 +39,6 @@ Created July 18, 2007 Vasil Dimov
#include "dict0load.h"
#include "buf0buddy.h"
#include "buf0buf.h"
-#include "ibuf0ibuf.h"
#include "dict0mem.h"
#include "dict0types.h"
#include "srv0start.h"
@@ -80,10 +79,7 @@ in i_s_page_type[] array */
/** R-tree index page */
#define I_S_PAGE_TYPE_RTREE (FIL_PAGE_TYPE_LAST + 1)
-/** Change buffer B-tree page */
-#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 2)
-
-#define I_S_PAGE_TYPE_LAST I_S_PAGE_TYPE_IBUF
+#define I_S_PAGE_TYPE_LAST I_S_PAGE_TYPE_RTREE
#define I_S_PAGE_TYPE_BITS 4
@@ -104,9 +100,6 @@ static buf_page_desc_t i_s_page_type[] = {
{"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2},
{"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN},
{"RTREE_INDEX", I_S_PAGE_TYPE_RTREE},
- {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF},
- {"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED},
- {"PAGE COMPRESSED AND ENCRYPTED", FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED},
};
/** This structure defines information we will fetch from pages
@@ -3776,17 +3769,17 @@ i_s_innodb_buffer_page_fill(
OK(fields[IDX_BUFFER_PAGE_STATE]->store(
std::min<uint32_t>(3, page_info->state) + 1, true));
- static_assert(buf_page_t::UNFIXED == 1U << 29, "comp.");
+ static_assert(buf_page_t::UNFIXED == 2U << 29, "comp.");
static_assert(buf_page_t::READ_FIX == 4U << 29, "comp.");
- static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp.");
+ static_assert(buf_page_t::WRITE_FIX == 6U << 29, "comp.");
unsigned io_fix = page_info->state >> 29;
if (io_fix < 4) {
io_fix = 1;
- } else if (io_fix > 5) {
- io_fix = 3;
+ } else if (io_fix == 4) {
+ io_fix = 2;
} else {
- io_fix -= 2;
+ io_fix = 3;
}
OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(io_fix, true));
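The io_fix classification above follows from the revised state encoding asserted just before it: the top three bits of buf_page_t::state() are 2 for UNFIXED, 4 for READ_FIX and 6 for WRITE_FIX, so shifting right by 29 and comparing reproduces the stored IO_FIX codes 1 (none), 2 (read) and 3 (write). The following standalone sketch mirrors exactly the mapping in this hunk, with hypothetical constants:

// Classify a 32-bit state word by its top three bits, mirroring the hunk:
// values below 4 are not I/O-fixed, 4 is read-fixed, anything higher is
// treated as write-fixed.
#include <cstdint>
#include <cassert>

static unsigned io_fix_code(uint32_t state)
{
  unsigned top = state >> 29;
  if (top < 4)
    return 1;
  if (top == 4)
    return 2;
  return 3;
}

int main()
{
  assert(io_fix_code(2U << 29) == 1);  // UNFIXED
  assert(io_fix_code(4U << 29) == 2);  // READ_FIX
  assert(io_fix_code(6U << 29) == 3);  // WRITE_FIX
  return 0;
}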
@@ -3824,14 +3817,9 @@ i_s_innodb_set_page_type(
their values are defined as 17855 and 17854, so we cannot
use them to index into i_s_page_type[] array, its array index
in the i_s_page_type[] array is I_S_PAGE_TYPE_INDEX
- (1) for index pages or I_S_PAGE_TYPE_IBUF for
- change buffer index pages */
+ (1) for index pages */
if (page_type == FIL_PAGE_RTREE) {
page_info->page_type = I_S_PAGE_TYPE_RTREE;
- } else if (page_info->index_id
- == static_cast<index_id_t>(DICT_IBUF_ID_MIN
- + IBUF_SPACE_ID)) {
- page_info->page_type = I_S_PAGE_TYPE_IBUF;
} else {
ut_ad(page_type == FIL_PAGE_INDEX
|| page_type == FIL_PAGE_TYPE_INSTANT);
@@ -3876,9 +3864,9 @@ i_s_innodb_buffer_page_get_info(
static_assert(buf_page_t::NOT_USED == 0, "compatibility");
static_assert(buf_page_t::MEMORY == 1, "compatibility");
static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility");
- static_assert(buf_page_t::UNFIXED == 1U << 29, "compatibility");
+ static_assert(buf_page_t::UNFIXED == 2U << 29, "compatibility");
static_assert(buf_page_t::READ_FIX == 4U << 29, "compatibility");
- static_assert(buf_page_t::WRITE_FIX == 5U << 29, "compatibility");
+ static_assert(buf_page_t::WRITE_FIX == 6U << 29, "compatibility");
page_info->state = bpage->state();
@@ -4268,17 +4256,17 @@ i_s_innodb_buf_page_lru_fill(
OK(fields[IDX_BUF_LRU_PAGE_STATE]->store(
page_info->compressed_only, true));
- static_assert(buf_page_t::UNFIXED == 1U << 29, "comp.");
+ static_assert(buf_page_t::UNFIXED == 2U << 29, "comp.");
static_assert(buf_page_t::READ_FIX == 4U << 29, "comp.");
- static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp.");
+ static_assert(buf_page_t::WRITE_FIX == 6U << 29, "comp.");
unsigned io_fix = page_info->state >> 29;
if (io_fix < 4) {
io_fix = 1;
- } else if (io_fix > 5) {
- io_fix = 3;
+ } else if (io_fix == 4) {
+ io_fix = 2;
} else {
- io_fix -= 2;
+ io_fix = 3;
}
OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(io_fix, true));
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index e77401ed..67a89f94 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -17,1260 +17,98 @@ this program; if not, write to the Free Software Foundation, Inc.,
*****************************************************************************/
-/**************************************************//**
+/**
@file ibuf/ibuf0ibuf.cc
-Insert buffer
-
-Created 7/19/1997 Heikki Tuuri
-*******************************************************/
+Upgrade and removal of the InnoDB change buffer
+*/
-#include <tuple>
#include "ibuf0ibuf.h"
#include "btr0sea.h"
-
-/** Number of bits describing a single page */
-#define IBUF_BITS_PER_PAGE 4
-/** The start address for an insert buffer bitmap page bitmap */
-#define IBUF_BITMAP PAGE_DATA
-
-#include "buf0buf.h"
-#include "buf0rea.h"
-#include "fsp0fsp.h"
-#include "trx0sys.h"
-#include "fil0fil.h"
-#include "rem0rec.h"
-#include "btr0cur.h"
#include "btr0pcur.h"
-#include "btr0btr.h"
#include "row0upd.h"
-#include "dict0boot.h"
-#include "fut0lst.h"
-#include "lock0lock.h"
-#include "log0recv.h"
-#include "que0que.h"
-#include "srv0start.h" /* srv_shutdown_state */
-#include "rem0cmp.h"
+#include "my_service_manager.h"
#include "log.h"
-/* STRUCTURE OF AN INSERT BUFFER RECORD
+/** Possible operations buffered in the change buffer. */
+enum ibuf_op
+{
+ IBUF_OP_INSERT= 0,
+ IBUF_OP_DELETE_MARK= 1,
+ IBUF_OP_DELETE= 2,
+};
+
+constexpr const page_id_t ibuf_root{0, FSP_IBUF_TREE_ROOT_PAGE_NO};
+constexpr const page_id_t ibuf_header{0, FSP_IBUF_HEADER_PAGE_NO};
+constexpr const index_id_t ibuf_index_id{0xFFFFFFFF00000000ULL};
-In versions < 4.1.x:
+/* Format of the change buffer records:
+
+MySQL 3.23 and MySQL 4.0 (not supported since MySQL 5.6.5 and MariaDB 10.0.11):
1. The first field is the page number.
2. The second field is an array which stores type info for each subsequent
- field. We store the information which affects the ordering of records, and
+ field (4 bytes per column).
+ We store the information which affects the ordering of records, and
also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
is 10 bytes.
3. Next we have the fields of the actual index record.
-In versions >= 4.1.x:
-
-Note that contary to what we planned in the 1990's, there will only be one
-insert buffer tree, and that is in the system tablespace of InnoDB.
+MySQL 4.1:
1. The first field is the space id.
2. The second field is a one-byte marker (0) which differentiates records from
the < 4.1.x storage format.
3. The third field is the page number.
-4. The fourth field contains the type info, where we have also added 2 bytes to
- store the charset. In the compressed table format of 5.0.x we must add more
- information here so that we can build a dummy 'index' struct which 5.0.x
- can use in the binary search on the index page in the ibuf merge phase.
+4. The fourth field contains the type info
+ (6 bytes per index field, 16-bit collation information added).
+ Unless ROW_FORMAT=REDUNDANT, we add more metadata here so that
+ we can access records in the index page.
5. The rest of the fields contain the fields of the actual index record.
-In versions >= 5.0.3:
+MySQL 5.0 (starting with MySQL 5.0.3) and MySQL 5.1:
The first byte of the fourth field is an additional marker (0) if the record
-is in the compact format. The presence of this marker can be detected by
-looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+is not in ROW_FORMAT=REDUNDANT. The presence of this marker can be detected by
+looking at the length of the field modulo 6.
The high-order bit of the character set field in the type info is the
"nullable" flag for the field.
-In versions >= 5.5:
+MySQL 5.5 and MariaDB 5.5 and later:
-The optional marker byte at the start of the fourth field is replaced by
-mandatory 3 fields, totaling 4 bytes:
+Unless innodb_change_buffering=inserts, the optional marker byte at
+the start of the fourth field may be replaced by 3 mandatory fields,
+comprising 4 bytes:
1. 2 bytes: Counter field, used to sort records within a (space id, page
no) in the order they were added. This is needed so that for example the
sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
correctly.
- 2. 1 byte: Operation type (see ibuf_op_t).
+ 2. 1 byte: Operation type (see ibuf_op).
- 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
-
-To ensure older records, which do not have counters to enforce correct
-sorting, are merged before any new records, ibuf_insert checks if we're
-trying to insert to a position that contains old-style records, and if so,
-refuses the insert. Thus, ibuf pages are gradually converted to the new
-format as their corresponding buffer pool pages are read into memory.
+ 3. 1 byte: 0=ROW_FORMAT=REDUNDANT, 1=other
*/
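The comment above documents the record layout that the remaining upgrade and merge code still has to parse: field 0 is the space id, field 1 a single zero marker byte, field 2 the page number, field 3 a 4-byte metadata prefix (2-byte counter, 1-byte ibuf_op code, 1-byte row-format flag) followed by the column type information, and the fields from IBUF_REC_FIELD_USER onward the user columns. Below is a hypothetical decoder for just that 4-byte metadata prefix, assuming InnoDB's usual big-endian integer encoding; the struct and helper are illustrative and are not functions defined later in this file.

// Illustrative decoder for the 4-byte metadata prefix described above:
// [0..1] big-endian counter, [2] operation code, [3] 0=REDUNDANT, 1=other.
#include <cstdint>
#include <cassert>

enum ibuf_op_sketch { INSERT = 0, DELETE_MARK = 1, DELETE_OP = 2 };

struct ibuf_metadata { uint16_t counter; uint8_t op; bool not_redundant; };

static ibuf_metadata decode_metadata(const uint8_t *b)
{
  return { static_cast<uint16_t>(b[0] << 8 | b[1]), b[2], b[3] != 0 };
}

int main()
{
  const uint8_t field[4] = { 0x00, 0x07, DELETE_MARK, 1 };
  ibuf_metadata m = decode_metadata(field);
  assert(m.counter == 7 && m.op == DELETE_MARK && m.not_redundant);
  return 0;
}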
-
-/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
-
-If an OS thread performs any operation that brings in disk pages from
-non-system tablespaces into the buffer pool, or creates such a page there,
-then the operation may have as a side effect an insert buffer index tree
-compression. Thus, the tree latch of the insert buffer tree may be acquired
-in the x-mode, and also the file space latch of the system tablespace may
-be acquired in the x-mode.
-
-Also, an insert to an index in a non-system tablespace can have the same
-effect. How do we know this cannot lead to a deadlock of OS threads? There
-is a problem with the i\o-handler threads: they break the latching order
-because they own x-latches to pages which are on a lower level than the
-insert buffer tree latch, its page latches, and the tablespace latch an
-insert buffer operation can reserve.
-
-The solution is the following: Let all the tree and page latches connected
-with the insert buffer be later in the latching order than the fsp latch and
-fsp page latches.
-
-Insert buffer pages must be such that the insert buffer is never invoked
-when these pages are accessed as this would result in a recursion violating
-the latching order. We let a special i/o-handler thread take care of i/o to
-the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
-pages and the first inode page, which contains the inode of the ibuf tree: let
-us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
-access both non-ibuf and ibuf pages.
-
-Then an i/o-handler for the insert buffer never needs to access recursively the
-insert buffer tree and thus obeys the latching order. On the other hand, other
-i/o-handlers for other tablespaces may require access to the insert buffer,
-but because all kinds of latches they need to access there are later in the
-latching order, no violation of the latching order occurs in this case,
-either.
-
-A problem is how to grow and contract an insert buffer tree. As it is later
-in the latching order than the fsp management, we have to reserve the fsp
-latch first, before adding or removing pages from the insert buffer tree.
-We let the insert buffer tree have its own file space management: a free
-list of pages linked to the tree root. To prevent recursive using of the
-insert buffer when adding pages to the tree, we must first load these pages
-to memory, obtaining a latch on them, and only after that add them to the
-free list of the insert buffer tree. More difficult is removing of pages
-from the free list. If there is an excess of pages in the free list of the
-ibuf tree, they might be needed if some thread reserves the fsp latch,
-intending to allocate more file space. So we do the following: if a thread
-reserves the fsp latch, we check the writer count field of the latch. If
-this field has value 1, it means that the thread did not own the latch
-before entering the fsp system, and the mtr of the thread contains no
-modifications to the fsp pages. Now we are free to reserve the ibuf latch,
-and check if there is an excess of pages in the free list. We can then, in a
-separate mini-transaction, take them out of the free list and free them to
-the fsp system.
-
-To avoid deadlocks in the ibuf system, we divide file pages into three levels:
-
-(1) non-ibuf pages,
-(2) ibuf tree pages and the pages in the ibuf tree free list, and
-(3) ibuf bitmap pages.
-
-No OS thread is allowed to access higher level pages if it has latches to
-lower level pages; even if the thread owns a B-tree latch it must not access
-the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
-is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
-exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
-level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
-it uses synchronous aio, it can access any pages, as long as it obeys the
-access order rules. */
-
-/** Operations that can currently be buffered. */
-ulong innodb_change_buffering;
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/** Dump the change buffer at startup */
-my_bool ibuf_dump;
-/** Flag to control insert buffer debugging. */
-uint ibuf_debug;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
-/** The insert buffer control structure */
-ibuf_t ibuf;
-
-/** @name Offsets to the per-page bits in the insert buffer bitmap */
-/* @{ */
-#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
- amount of free space */
-#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
- changes for the page */
-#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
- the ibuf tree, excluding the
- root page, or is in the free
- list of the ibuf */
-/* @} */
-
-#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format,
- the page number. later, the space_id */
-#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker
- consisting of 1 byte that is 0 */
-#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the
- page number */
-#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */
-#define IBUF_REC_FIELD_USER 4 /* first user field */
-
-/* Various constants for checking the type of an ibuf record and extracting
-data from it. For details, see the description of the record format at the
-top of this file. */
-
-/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record
-The fourth column in the MySQL 5.5 format contains an operation
-type, counter, and some flags. */
-/* @{ */
-#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
- the beginning of the fourth field */
-
-/* Offsets for the fields at the beginning of the fourth field */
-#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
-#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
-#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
-
-/* Record flag masks */
-#define IBUF_REC_COMPACT 0x1 /*!< Set in
- IBUF_REC_OFFSET_FLAGS if the
- user index is in COMPACT
- format or later */
-
-
-#ifndef SAFE_MUTEX
-static
-#endif /* SAFE_MUTEX */
-/** The mutex protecting the insert buffer */
-mysql_mutex_t ibuf_mutex,
- /** The mutex covering pessimistic inserts into the change buffer */
- ibuf_pessimistic_insert_mutex;
-
-/** The area in pages from which contract looks for page numbers for merge */
-constexpr ulint IBUF_MERGE_AREA = 8;
-
-/** In ibuf_contract() at most this number of pages is read to memory in one
-batch, in order to merge the entries for them in the change buffer */
-constexpr ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA;
-
-/* TODO: how to cope with drop table if there are records in the insert
-buffer for the indexes of the table? Is there actually any problem,
-because ibuf merge is done to a page when it is read in, and it is
-still physically like the index page even if the index would have been
-dropped! So, there seems to be no problem. */
-
-/******************************************************************//**
-Sets the flag in the current mini-transaction record indicating we're
-inside an insert buffer routine. */
-UNIV_INLINE
-void
-ibuf_enter(
-/*=======*/
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(!mtr->is_inside_ibuf());
- mtr->enter_ibuf();
-}
-
-/******************************************************************//**
-Sets the flag in the current mini-transaction record indicating we're
-exiting an insert buffer routine. */
-UNIV_INLINE
-void
-ibuf_exit(
-/*======*/
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(mtr->is_inside_ibuf());
- mtr->exit_ibuf();
-}
-
-/**************************************************************//**
-Commits an insert buffer mini-transaction and sets the persistent
-cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */
-UNIV_INLINE
-void
-ibuf_btr_pcur_commit_specify_mtr(
-/*=============================*/
- btr_pcur_t* pcur, /*!< in/out: persistent cursor */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_d(ibuf_exit(mtr));
- btr_pcur_commit_specify_mtr(pcur, mtr);
-}
-
-/******************************************************************//**
-Gets the ibuf header page and x-latches it.
-@return insert buffer header page */
-static
-page_t*
-ibuf_header_page_get(
-/*=================*/
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(!ibuf_inside(mtr));
-
- buf_block_t* block = buf_page_get(
- page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
- 0, RW_X_LATCH, mtr);
- if (UNIV_UNLIKELY(!block)) {
- return nullptr;
- }
-
- buf_page_make_young_if_needed(&block->page);
-
- return block->page.frame;
-}
-
-/** Acquire the change buffer root page.
-@param[in,out] mtr mini-transaction
-@return change buffer root page, SX-latched */
-static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr)
-{
- ut_ad(ibuf_inside(mtr));
- mysql_mutex_assert_owner(&ibuf_mutex);
-
- mtr_sx_lock_index(ibuf.index, mtr);
-
- buf_block_t *block=
- buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO},
- 0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err);
- if (block)
- {
- ut_ad(ibuf.empty == page_is_empty(block->page.frame));
- buf_page_make_young_if_needed(&block->page);
- }
-
- return block;
-}
-
-/******************************************************************//**
-Closes insert buffer and frees the data structures. */
-void
-ibuf_close(void)
-/*============*/
-{
- if (!ibuf.index) {
- return;
- }
-
- mysql_mutex_destroy(&ibuf_pessimistic_insert_mutex);
- mysql_mutex_destroy(&ibuf_mutex);
-
- dict_table_t* ibuf_table = ibuf.index->table;
- ibuf.index->lock.free();
- dict_mem_index_free(ibuf.index);
- dict_mem_table_free(ibuf_table);
- ibuf.index = NULL;
-}
-
-/******************************************************************//**
-Updates the size information of the ibuf, assuming the segment size has not
-changed. */
-static
-void
-ibuf_size_update(
-/*=============*/
- const page_t* root) /*!< in: ibuf tree root */
-{
- mysql_mutex_assert_owner(&ibuf_mutex);
-
- ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
- + PAGE_BTR_IBUF_FREE_LIST);
-
- ibuf.height = 1 + btr_page_get_level(root);
-
- /* the '1 +' is the ibuf header page */
- ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
-}
-
-/******************************************************************//**
-Creates the insert buffer data structure at a database startup and initializes
-the data structures for the insert buffer.
-@return DB_SUCCESS or failure */
-dberr_t
-ibuf_init_at_db_start(void)
-/*=======================*/
-{
- page_t* root;
-
- ut_ad(!ibuf.index);
- mtr_t mtr;
- mtr.start();
- compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
- compile_time_assert(IBUF_SPACE_ID == 0);
- mtr.x_lock_space(fil_system.sys_space);
- dberr_t err;
- buf_block_t* header_page = buf_page_get_gen(
- page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
- 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
-
- if (!header_page) {
-err_exit:
- sql_print_error("InnoDB: The change buffer is corrupted"
- " or has been removed on upgrade"
- " to MariaDB 11.0 or later");
- mtr.commit();
- if (innodb_change_buffering == IBUF_USE_NONE) {
- err = DB_SUCCESS;
- }
- return err;
- }
-
- fseg_n_reserved_pages(*header_page,
- IBUF_HEADER + IBUF_TREE_SEG_HEADER
- + header_page->page.frame, &ibuf.seg_size, &mtr);
-
- do {
- IF_DBUG(if (_db_keyword_(nullptr, "intermittent_read_failure",
- 1)) continue,);
- ut_ad(ibuf.seg_size >= 2);
- } while (0);
-
- if (buf_block_t* block =
- buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
- FSP_IBUF_TREE_ROOT_PAGE_NO),
- 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) {
- root = buf_block_get_frame(block);
- } else {
- goto err_exit;
- }
-
- DBUG_EXECUTE_IF("ibuf_init_corrupt",
- err = DB_CORRUPTION;
- goto err_exit;);
-
- if (page_is_comp(root) || fil_page_get_type(root) != FIL_PAGE_INDEX
- || btr_page_get_index_id(root) != DICT_IBUF_ID_MIN) {
- err = DB_CORRUPTION;
- goto err_exit;
- }
-
- /* At startup we intialize ibuf to have a maximum of
- CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
- buffer pool size. Once ibuf struct is initialized this
- value is updated with the user supplied size by calling
- ibuf_max_size_update(). */
- ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
- * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
-
- mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr);
- mysql_mutex_init(ibuf_pessimistic_insert_mutex_key,
- &ibuf_pessimistic_insert_mutex, nullptr);
-
- mysql_mutex_lock(&ibuf_mutex);
- ibuf_size_update(root);
- mysql_mutex_unlock(&ibuf_mutex);
-
- ibuf.empty = page_is_empty(root);
- mtr.commit();
-
- ibuf.index = dict_mem_index_create(
- dict_table_t::create(
- {C_STRING_WITH_LEN("innodb_change_buffer")},
- fil_system.sys_space, 1, 0, 0, 0),
- "CLUST_IND",
- DICT_CLUSTERED | DICT_IBUF, 1);
- ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
- ibuf.index->n_uniq = REC_MAX_N_FIELDS;
- ibuf.index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
-#ifdef BTR_CUR_ADAPT
- ibuf.index->search_info = btr_search_info_create(ibuf.index->heap);
-#endif /* BTR_CUR_ADAPT */
- ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO;
- ut_d(ibuf.index->cached = TRUE);
-
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
- if (!ibuf_dump) {
- return DB_SUCCESS;
- }
- ib::info() << "Dumping the change buffer";
- ibuf_mtr_start(&mtr);
- btr_pcur_t pcur;
- if (DB_SUCCESS
- == pcur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr)) {
- while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
- rec_print_old(stderr, btr_pcur_get_rec(&pcur));
- }
- }
- ibuf_mtr_commit(&mtr);
- ib::info() << "Dumped the change buffer";
-#endif
-
- return DB_SUCCESS;
-}
-
-/*********************************************************************//**
-Updates the max_size value for ibuf. */
-void
-ibuf_max_size_update(
-/*=================*/
- ulint new_val) /*!< in: new value in terms of
- percentage of the buffer pool size */
-{
- if (UNIV_UNLIKELY(!ibuf.index)) return;
- ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
- * new_val) / 100;
- mysql_mutex_lock(&ibuf_mutex);
- ibuf.max_size = new_size;
- mysql_mutex_unlock(&ibuf_mutex);
-}
-
-# ifdef UNIV_DEBUG
-/** Gets the desired bits for a given page from a bitmap page.
-@param[in] page bitmap page
-@param[in] page_id page id whose bits to get
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
-@param[in,out] mtr mini-transaction holding an x-latch on the
-bitmap page
-@return value of bits */
-# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \
- ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \
- MTR_MEMO_PAGE_X_FIX, mtr, bit)
-# else /* UNIV_DEBUG */
-/** Gets the desired bits for a given page from a bitmap page.
-@param[in] page bitmap page
-@param[in] page_id page id whose bits to get
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
-@param[in,out] mtr mini-transaction holding an x-latch on the
-bitmap page
-@return value of bits */
-# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \
- ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit)
-# endif /* UNIV_DEBUG */
-
-/** Gets the desired bits for a given page from a bitmap page.
-@param[in] page bitmap page
-@param[in] page_id page id whose bits to get
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ...
-@param[in,out] mtr mini-transaction holding latch_type on the
-bitmap page
-@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
-@return value of bits */
-UNIV_INLINE
-ulint
-ibuf_bitmap_page_get_bits_low(
- const page_t* page,
- const page_id_t page_id,
- ulint zip_size,
-#ifdef UNIV_DEBUG
- ulint latch_type,
- mtr_t* mtr,
-#endif /* UNIV_DEBUG */
- ulint bit)
-{
- ulint byte_offset;
- ulint bit_offset;
- ulint map_byte;
- ulint value;
- const ulint size = zip_size ? zip_size : srv_page_size;
-
- ut_ad(ut_is_2pow(zip_size));
- ut_ad(bit < IBUF_BITS_PER_PAGE);
- compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
- ut_ad(mtr->memo_contains_page_flagged(page, latch_type));
-
- bit_offset = (page_id.page_no() & (size - 1))
- * IBUF_BITS_PER_PAGE + bit;
-
- byte_offset = bit_offset / 8;
- bit_offset = bit_offset % 8;
-
- ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
-
- map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
-
- value = ut_bit_get_nth(map_byte, bit_offset);
-
- if (bit == IBUF_BITMAP_FREE) {
- ut_ad(bit_offset + 1 < 8);
-
- value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
- }
-
- return(value);
-}
-
-/** Sets the desired bit for a given page in a bitmap page.
-@tparam bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
-@param[in,out] block bitmap page
-@param[in] page_id page id whose bits to set
-@param[in] physical_size page size
-@param[in] val value to set
-@param[in,out] mtr mtr containing an x-latch to the bitmap page */
-template<ulint bit>
-static void
-ibuf_bitmap_page_set_bits(
- buf_block_t* block,
- const page_id_t page_id,
- ulint physical_size,
- ulint val,
- mtr_t* mtr)
-{
- ulint byte_offset;
- ulint bit_offset;
-
- static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit");
- compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
- ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
- ut_ad(mtr->is_named_space(page_id.space()));
-
- bit_offset = (page_id.page_no() % physical_size)
- * IBUF_BITS_PER_PAGE + bit;
-
- byte_offset = bit_offset / 8;
- bit_offset = bit_offset % 8;
-
- ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
-
- byte* map_byte = &block->page.frame[IBUF_BITMAP + byte_offset];
- byte b = *map_byte;
-
- if (bit == IBUF_BITMAP_FREE) {
- ut_ad(bit_offset + 1 < 8);
- ut_ad(val <= 3);
- b &= static_cast<byte>(~(3U << bit_offset));
- b |= static_cast<byte>(((val & 2) >> 1) << bit_offset
- | (val & 1) << (bit_offset + 1));
- } else {
- ut_ad(val <= 1);
- b &= static_cast<byte>(~(1U << bit_offset));
-#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
-# pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
-#endif
- b |= static_cast<byte>(val << bit_offset);
-#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
-# pragma GCC diagnostic pop
-#endif
- }
-
- mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b);
-}
-
-/** Calculates the bitmap page number for a given page number.
-@param[in] page_id page id
-@param[in] size page size
-@return the bitmap page id where the file page is mapped */
-inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
-{
- if (!size)
- size= srv_page_size;
-
- return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
- + uint32_t(page_id.page_no() & ~(size - 1)));
-}
-
-/** Gets the ibuf bitmap page where the bits describing a given file page are
-stored.
-@param[in] page_id page id of the file page
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction
-@return bitmap page where the file page is mapped, that is, the bitmap
-page containing the descriptor bits for the file page; the bitmap page
-is x-latched */
-static
-buf_block_t*
-ibuf_bitmap_get_map_page(
- const page_id_t page_id,
- ulint zip_size,
- mtr_t* mtr)
-{
- return buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size),
- zip_size, RW_X_LATCH, nullptr,
- BUF_GET_POSSIBLY_FREED, mtr);
-}
-
-/************************************************************************//**
-Sets the free bits of the page in the ibuf bitmap. This is done in a separate
-mini-transaction, hence this operation does not restrict further work to only
-ibuf bitmap operations, which would result if the latch to the bitmap page
-were kept. */
-UNIV_INLINE
-void
-ibuf_set_free_bits_low(
-/*===================*/
- const buf_block_t* block, /*!< in: index page; free bits are set if
- the index is non-clustered and page
- level is 0 */
- ulint val, /*!< in: value to set: < 4 */
- mtr_t* mtr) /*!< in/out: mtr */
-{
- ut_ad(mtr->is_named_space(block->page.id().space()));
- if (!page_is_leaf(block->page.frame)) {
- return;
- }
-
-#ifdef UNIV_IBUF_DEBUG
- ut_a(val <= ibuf_index_page_calc_free(block));
-#endif /* UNIV_IBUF_DEBUG */
- const page_id_t id(block->page.id());
-
- if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
- id, block->zip_size(), mtr)) {
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
- bitmap_page, id, block->physical_size(),
- val, mtr);
- }
-}
-
-/************************************************************************//**
-Sets the free bit of the page in the ibuf bitmap. This is done in a separate
-mini-transaction, hence this operation does not restrict further work to only
-ibuf bitmap operations, which would result if the latch to the bitmap page
-were kept. */
-void
-ibuf_set_free_bits_func(
-/*====================*/
- buf_block_t* block, /*!< in: index page of a non-clustered index;
- free bit is reset if page level is 0 */
-#ifdef UNIV_IBUF_DEBUG
- ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
- value which the bits must have before
- setting; this is for debugging */
-#endif /* UNIV_IBUF_DEBUG */
- ulint val) /*!< in: value to set: < 4 */
-{
- if (!page_is_leaf(block->page.frame))
- return;
-
- mtr_t mtr;
- mtr.start();
- const page_id_t id(block->page.id());
- const fil_space_t *space= mtr.set_named_space_id(id.space());
-
- if (buf_block_t *bitmap_page=
- ibuf_bitmap_get_map_page(id, block->zip_size(), &mtr))
- {
- if (space->purpose != FIL_TYPE_TABLESPACE)
- mtr.set_log_mode(MTR_LOG_NO_REDO);
-
-#ifdef UNIV_IBUF_DEBUG
- if (max_val != ULINT_UNDEFINED)
- {
- ulint old_val= ibuf_bitmap_page_get_bits(bitmap_page, id,
- IBUF_BITMAP_FREE, &mtr);
- ut_a(old_val <= max_val);
- }
-
- ut_a(val <= ibuf_index_page_calc_free(block));
-#endif /* UNIV_IBUF_DEBUG */
-
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
- (bitmap_page, id, block->physical_size(), val, &mtr);
- }
-
- mtr.commit();
-}
-
-/************************************************************************//**
-Resets the free bits of the page in the ibuf bitmap. This is done in a
-separate mini-transaction, hence this operation does not restrict
-further work to only ibuf bitmap operations, which would result if the
-latch to the bitmap page were kept. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to decrement or reset the bits in the bitmap in a mini-transaction
-that is committed before the mini-transaction that affects the free
-space. */
-void
-ibuf_reset_free_bits(
-/*=================*/
- buf_block_t* block) /*!< in: index page; free bits are set to 0
- if the index is a non-clustered
- non-unique, and page level is 0 */
-{
- ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
-}
-
-/**********************************************************************//**
-Updates the free bits for an uncompressed page to reflect the present
-state. Does this in the mtr given, which means that the latching
-order rules virtually prevent any further operations for this OS
-thread until mtr is committed. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to set the free bits in the same mini-transaction that updated the
-page. */
-void
-ibuf_update_free_bits_low(
-/*======================*/
- const buf_block_t* block, /*!< in: index page */
- ulint max_ins_size, /*!< in: value of
- maximum insert size
- with reorganize before
- the latest operation
- performed to the page */
- mtr_t* mtr) /*!< in/out: mtr */
-{
- ulint before;
- ulint after;
-
- ut_a(!is_buf_block_get_page_zip(block));
- ut_ad(mtr->is_named_space(block->page.id().space()));
-
- before = ibuf_index_page_calc_free_bits(srv_page_size,
- max_ins_size);
-
- after = ibuf_index_page_calc_free(block);
-
- /* This approach cannot be used on compressed pages, since the
- computed value of "before" often does not match the current
- state of the bitmap. This is because the free space may
- increase or decrease when a compressed page is reorganized. */
- if (before != after) {
- ibuf_set_free_bits_low(block, after, mtr);
- }
-}
-
-/**********************************************************************//**
-Updates the free bits for a compressed page to reflect the present
-state. Does this in the mtr given, which means that the latching
-order rules virtually prevent any further operations for this OS
-thread until mtr is committed. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to set the free bits in the same mini-transaction that updated the
-page. */
-void
-ibuf_update_free_bits_zip(
-/*======================*/
- buf_block_t* block, /*!< in/out: index page */
- mtr_t* mtr) /*!< in/out: mtr */
-{
- ut_ad(page_is_leaf(block->page.frame));
- ut_ad(block->zip_size());
-
- ulint after = ibuf_index_page_calc_free_zip(block);
-
- if (after == 0) {
- /* We move the page to the front of the buffer pool LRU list:
- the purpose of this is to prevent those pages to which we
- cannot make inserts using the insert buffer from slipping
- out of the buffer pool */
-
- buf_page_make_young(&block->page);
- }
-
- if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
- block->page.id(), block->zip_size(), mtr)) {
-
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
- bitmap_page, block->page.id(),
- block->physical_size(), after, mtr);
- }
-}
-
-/**********************************************************************//**
-Updates the free bits for the two pages to reflect the present state.
-Does this in the mtr given, which means that the latching order rules
-virtually prevent any further operations until mtr is committed.
-NOTE: The free bits in the insert buffer bitmap must never exceed the
-free space on a page. It is safe to set the free bits in the same
-mini-transaction that updated the pages. */
-void
-ibuf_update_free_bits_for_two_pages_low(
-/*====================================*/
- buf_block_t* block1, /*!< in: index page */
- buf_block_t* block2, /*!< in: index page */
- mtr_t* mtr) /*!< in: mtr */
-{
- ut_ad(mtr->is_named_space(block1->page.id().space()));
- ut_ad(block1->page.id().space() == block2->page.id().space());
-
- /* Avoid deadlocks by acquiring multiple bitmap page latches in
- a consistent order (smaller pointer first). */
- if (block1 > block2)
- std::swap(block1, block2);
-
- ibuf_set_free_bits_low(block1, ibuf_index_page_calc_free(block1), mtr);
- ibuf_set_free_bits_low(block2, ibuf_index_page_calc_free(block2), mtr);
-}
-
-/** Returns TRUE if the page is one of the fixed address ibuf pages.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return TRUE if a fixed address ibuf i/o page */
-inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size)
-{
- return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO)
- || ibuf_bitmap_page(page_id, zip_size));
-}
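
Note on the bitmap check that ibuf_fixed_addr_page() delegates to: a change buffer bitmap page occurs at a fixed offset within every run of page-size many pages. A hedged sketch of that predicate (the offset 1 corresponds to FSP_IBUF_BITMAP_OFFSET; treat the exact constant as an assumption here):

    #include <cstdint>

    // Sketch, not the real ibuf_bitmap_page(): bitmap pages repeat every
    // pages_per_group pages (a power of two), at offset 1 within each group.
    static bool is_ibuf_bitmap_page(uint32_t page_no, uint32_t pages_per_group)
    {
      return (page_no & (pages_per_group - 1)) == 1;
    }
    // Example: with 16384 pages per group, pages 1, 16385, 32769, ... are
    // bitmap pages; the tree root page is matched by the explicit
    // comparison against IBUF_TREE_ROOT_PAGE_NO above.
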
-
-/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
-Must not be called when recv_no_ibuf_operations==true.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] x_latch FALSE if relaxed check (avoid latching the
-bitmap page)
-@param[in,out] mtr mini-transaction which will contain an x-latch to the
-bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
-in which case a temporary mini-transaction is created and committed.
-@return TRUE if level 2 or level 3 page */
-bool
-ibuf_page_low(
- const page_id_t page_id,
- ulint zip_size,
-#ifdef UNIV_DEBUG
- bool x_latch,
-#endif /* UNIV_DEBUG */
- mtr_t* mtr)
-{
- ibool ret;
- mtr_t local_mtr;
-
- ut_ad(!recv_no_ibuf_operations);
- ut_ad(x_latch || mtr == NULL);
-
- if (ibuf_fixed_addr_page(page_id, zip_size)) {
- return(true);
- } else if (page_id.space() != IBUF_SPACE_ID) {
- return(false);
- }
-
- compile_time_assert(IBUF_SPACE_ID == 0);
- ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE);
-
-#ifdef UNIV_DEBUG
- if (!x_latch) {
- mtr_start(&local_mtr);
-
- /* Get the bitmap page without a page latch, so that
- we will not be violating the latching order when
- another bitmap page has already been latched by this
- thread. The page will be buffer-fixed, and thus it
- cannot be removed or relocated while we are looking at
- it. The contents of the page could change, but the
- IBUF_BITMAP_IBUF bit that we are interested in should
- not be modified by any other thread. Nobody should be
- calling ibuf_add_free_page() or ibuf_remove_free_page()
- while the page is linked to the insert buffer b-tree. */
- buf_block_t* block = buf_page_get_gen(
- ibuf_bitmap_page_no_calc(page_id, zip_size),
- zip_size, RW_NO_LATCH, nullptr, BUF_GET, &local_mtr);
-
- ret = block
- && ibuf_bitmap_page_get_bits_low(
- block->page.frame, page_id, zip_size,
- MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);
-
- mtr_commit(&local_mtr);
- return(ret);
- }
-#endif /* UNIV_DEBUG */
-
- if (mtr == NULL) {
- mtr = &local_mtr;
- mtr_start(mtr);
- }
-
- buf_block_t *block = ibuf_bitmap_get_map_page(page_id, zip_size,
- mtr);
- ret = block
- && ibuf_bitmap_page_get_bits(block->page.frame,
- page_id, zip_size,
- IBUF_BITMAP_IBUF, mtr);
-
- if (mtr == &local_mtr) {
- mtr_commit(mtr);
- }
-
- return(ret);
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec)
-#else /* UNIV_DEBUG */
-# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec)
-#endif /* UNIV_DEBUG */
+/** first user record field */
+constexpr unsigned IBUF_REC_FIELD_USER= 4;
/********************************************************************//**
Returns the page number field of an ibuf record.
@return page number */
-static
-uint32_t
-ibuf_rec_get_page_no_func(
-/*======================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec) /*!< in: ibuf record */
+static uint32_t ibuf_rec_get_page_no(const rec_t *rec)
{
- const byte* field;
- ulint len;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
- ut_ad(rec_get_n_fields_old(rec) > 2);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
-
- ut_a(len == 1);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
-
- ut_a(len == 4);
-
- return(mach_read_from_4(field));
+ return mach_read_from_4(rec + 5);
}
-#ifdef UNIV_DEBUG
-# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec)
-#else /* UNIV_DEBUG */
-# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec)
-#endif /* UNIV_DEBUG */
-
/********************************************************************//**
-Returns the space id field of an ibuf record. For < 4.1.x format records
-returns 0.
+Returns the space id field of an ibuf record.
@return space id */
-static
-uint32_t
-ibuf_rec_get_space_func(
-/*====================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec) /*!< in: ibuf record */
-{
- const byte* field;
- ulint len;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
- ut_ad(rec_get_n_fields_old(rec) > 2);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
-
- ut_a(len == 1);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
-
- ut_a(len == 4);
-
- return(mach_read_from_4(field));
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
- ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter)
-#else /* UNIV_DEBUG */
-# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
- ibuf_rec_get_info_func(rec,op,comp,info_len,counter)
-#endif
-/****************************************************************//**
-Get various information about an ibuf record in >= 4.1.x format. */
-static
-void
-ibuf_rec_get_info_func(
-/*===================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec, /*!< in: ibuf record */
- ibuf_op_t* op, /*!< out: operation type, or NULL */
- ibool* comp, /*!< out: compact flag, or NULL */
- ulint* info_len, /*!< out: length of info fields at the
- start of the fourth field, or
- NULL */
- ulint* counter) /*!< out: counter value, or NULL */
-{
- const byte* types;
- ulint fields;
- ulint len;
-
- /* Local variables to shadow arguments. */
- ibuf_op_t op_local;
- ibool comp_local;
- ulint info_len_local;
- ulint counter_local;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
- fields = rec_get_n_fields_old(rec);
- ut_a(fields > IBUF_REC_FIELD_USER);
-
- types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
-
- info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
- compile_time_assert(IBUF_REC_INFO_SIZE
- < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
-
- switch (info_len_local) {
- case 0:
- case 1:
- op_local = IBUF_OP_INSERT;
- comp_local = info_len_local;
- ut_ad(!counter);
- counter_local = ULINT_UNDEFINED;
- break;
-
- case IBUF_REC_INFO_SIZE:
- op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
- comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
- counter_local = mach_read_from_2(
- types + IBUF_REC_OFFSET_COUNTER);
- break;
-
- default:
- ut_error;
- }
-
- ut_a(op_local < IBUF_OP_COUNT);
- ut_a((len - info_len_local) ==
- (fields - IBUF_REC_FIELD_USER)
- * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
-
- if (op) {
- *op = op_local;
- }
-
- if (comp) {
- *comp = comp_local;
- }
-
- if (info_len) {
- *info_len = info_len_local;
- }
-
- if (counter) {
- *counter = counter_local;
- }
-}
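
The switch above decodes the info prefix of the fourth ("metadata") field. As a reading aid, here is an illustrative decoder for the IBUF_REC_INFO_SIZE case, following the IBUF_REC_OFFSET_* constants used in the code (a sketch; the struct name and helper are not part of InnoDB):

    #include <cstdint>

    // Illustrative view of the 4-byte info prefix decoded above.
    struct ibuf_rec_info
    {
      uint16_t counter; // IBUF_REC_OFFSET_COUNTER: stored big-endian
      uint8_t  type;    // IBUF_REC_OFFSET_TYPE: an ibuf_op_t value
      uint8_t  flags;   // IBUF_REC_OFFSET_FLAGS: IBUF_REC_COMPACT bit
    };

    static ibuf_rec_info decode_info(const unsigned char *metadata)
    {
      ibuf_rec_info i;
      i.counter = uint16_t(metadata[0] << 8 | metadata[1]); // mach_read_from_2()
      i.type    = metadata[2];
      i.flags   = metadata[3];
      return i;
    }
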
-
-#ifdef UNIV_DEBUG
-# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
-#else /* UNIV_DEBUG */
-# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
-#endif
-
-/****************************************************************//**
-Returns the operation type field of an ibuf record.
-@return operation type */
-static
-ibuf_op_t
-ibuf_rec_get_op_type_func(
-/*======================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec) /*!< in: ibuf record */
-{
- ulint len;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
- ut_ad(rec_get_n_fields_old(rec) > 2);
-
- (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
-
- if (len > 1) {
- /* This is a < 4.1.x format record */
-
- return(IBUF_OP_INSERT);
- } else {
- ibuf_op_t op;
-
- ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
-
- return(op);
- }
-}
-
-/****************************************************************//**
-Read the first two bytes from a record's fourth field (counter field in new
-records; something else in older records).
-@return "counter" field, or ULINT_UNDEFINED if the record is too
-short to contain a counter */
-ulint
-ibuf_rec_get_counter(
-/*=================*/
- const rec_t* rec) /*!< in: ibuf record */
+static uint32_t ibuf_rec_get_space(const rec_t *rec)
{
- const byte* ptr;
- ulint len;
-
- if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) {
-
- return(ULINT_UNDEFINED);
- }
-
- ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
-
- if (len >= 2) {
-
- return(mach_read_from_2(ptr));
- } else {
-
- return(ULINT_UNDEFINED);
- }
-}
-
-
-/**
- Add accumulated operation counts to a permanent array.
- Both arrays must be of size IBUF_OP_COUNT.
-*/
-static void ibuf_add_ops(Atomic_counter<ulint> *out, const ulint *in)
-{
- for (auto i = 0; i < IBUF_OP_COUNT; i++)
- out[i]+= in[i];
+ return mach_read_from_4(rec);
}
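
The two replacement accessors hard-code the byte offsets of the fixed prefix of a ROW_FORMAT=REDUNDANT change buffer record: a 4-byte space id, a single zero marker byte, and a 4-byte page number, which is why ibuf_rec_get_space() reads at offset 0 and ibuf_rec_get_page_no() at offset 5. A hedged sketch of that prefix (the struct is illustrative only):

    #include <cstdint>

    // Fixed data prefix of a change buffer record, as implied by the
    // accessors above; the metadata field and user fields follow it.
    struct ibuf_rec_prefix
    {
      uint8_t space_id[4]; // IBUF_REC_FIELD_SPACE, big-endian
      uint8_t marker;      // IBUF_REC_FIELD_MARKER, always 0
      uint8_t page_no[4];  // IBUF_REC_FIELD_PAGE, big-endian
    };

    static uint32_t read_be32(const uint8_t *p) // like mach_read_from_4()
    {
      return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
           | uint32_t(p[2]) << 8 | p[3];
    }
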
-
-/****************************************************************//**
-Print operation counts. The array must be of size IBUF_OP_COUNT. */
-static
-void
-ibuf_print_ops(
-/*===========*/
- const char* op_name,/*!< in: operation name */
- const Atomic_counter<ulint>* ops, /*!< in: operation counts */
- FILE* file) /*!< in: file where to print */
-{
- static const char* op_names[] = {
- "insert",
- "delete mark",
- "delete"
- };
-
- static_assert(array_elements(op_names) == IBUF_OP_COUNT, "");
- fputs(op_name, file);
-
- for (ulint i = 0; i < IBUF_OP_COUNT; i++) {
- fprintf(file, "%s " ULINTPF "%s", op_names[i],
- ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? ", " : "");
- }
-
- putc('\n', file);
-}
-
-/********************************************************************//**
-Creates a dummy index for inserting a record to a non-clustered index.
-@return dummy index */
-static
-dict_index_t*
-ibuf_dummy_index_create(
-/*====================*/
- ulint n, /*!< in: number of fields */
- ibool comp) /*!< in: TRUE=use compact record format */
-{
- dict_table_t* table;
- dict_index_t* index;
-
- table = dict_table_t::create({C_STRING_WITH_LEN("IBUF_DUMMY")},
- nullptr, n, 0,
- comp ? DICT_TF_COMPACT : 0, 0);
-
- index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n);
-
- /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
- index->cached = TRUE;
- ut_d(index->is_dummy = true);
-
- return(index);
-}
/********************************************************************//**
Add a column to the dummy index */
static
@@ -1289,93 +127,79 @@ ibuf_dummy_index_add_col(
dict_index_add_col(index, index->table,
dict_table_get_nth_col(index->table, i), len);
}
-/********************************************************************//**
-Deallocates a dummy index for inserting a record to a non-clustered index. */
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
static
void
-ibuf_dummy_index_free(
-/*==================*/
- dict_index_t* index) /*!< in, own: dummy index */
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
{
- dict_table_t* table = index->table;
-
- dict_mem_index_free(index);
- dict_mem_table_free(table);
-}
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
-#ifdef UNIV_DEBUG
-# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
- ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
-#else /* UNIV_DEBUG */
-# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
- ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
-#endif
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
-/*********************************************************************//**
-Builds the entry used to
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
-1) IBUF_OP_INSERT: insert into a non-clustered index
+ type->len = mach_read_from_2(buf + 2);
-2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
- activate
+ uint32_t charset_coll = (mach_read_from_2(buf + 4) & CHAR_COLL_MASK)
+ << 16;
-3) IBUF_OP_DELETE: find the record we need to delete
+ if (dtype_is_string_type(type->mtype)) {
+ type->prtype |= charset_coll;
-when we have the corresponding record in an ibuf index.
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted before
+ MySQL 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+ type->prtype |= default_charset_info->number << 16;
+ }
+ }
-NOTE that as we copy pointers to fields in ibuf_rec, the caller must
-hold a latch to the ibuf_rec page as long as the entry is used!
+ dtype_set_mblen(type);
+}
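
The decoder above consumes one 6-byte ordering/NULL-size descriptor per column, which is why the loop further below advances by types + i * 6. For orientation, a sketch of the inverse (serializing) direction, with the flag bits taken from the reads performed above; the function is illustrative and not the real dtype_new_store_for_order_and_null_size():

    #include <cstdint>

    // Sketch of the 6-byte layout read by the decoder above:
    // byte 0: mtype (low 6 bits), bit 7 = binary; byte 1: low prtype bits;
    // bytes 2-3: length; bytes 4-5: charset-collation, bit 7 of byte 4 = NOT NULL.
    static void write_order_and_null_size(uint8_t *buf, unsigned mtype,
                                          unsigned prtype_low, uint16_t len,
                                          uint16_t charset_coll,
                                          bool binary, bool not_null)
    {
      buf[0] = uint8_t((mtype & 63) | (binary ? 128 : 0));
      buf[1] = uint8_t(prtype_low);
      buf[2] = uint8_t(len >> 8);
      buf[3] = uint8_t(len);
      uint16_t b45 = uint16_t(charset_coll & 0x7fff);
      if (not_null)
        b45 = uint16_t(b45 | 0x8000);
      buf[4] = uint8_t(b45 >> 8);
      buf[5] = uint8_t(b45);
    }
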
-@return own: entry to insert to a non-clustered index */
-static
-dtuple_t*
-ibuf_build_entry_from_ibuf_rec_func(
-/*================================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
- mem_heap_t* heap, /*!< in: heap where built */
- dict_index_t** pindex) /*!< out, own: dummy index that
- describes the entry */
+/** Construct an index entry and an index for applying an operation.
+@param ibuf_rec change buffer record in an X-latched page
+@param not_redundant whether another format than ROW_FORMAT=REDUNDANT is used
+@param n_fields number of index record fields
+@param types type information
+@param heap memory heap
+@param index dummy index metadata
+@return the index entry for applying the operation */
+static dtuple_t *ibuf_entry_build(const rec_t *ibuf_rec, ulint not_redundant,
+ ulint n_fields, const byte *types,
+ mem_heap_t *heap, dict_index_t *&index)
{
dtuple_t* tuple;
dfield_t* field;
- ulint n_fields;
- const byte* types;
const byte* data;
ulint len;
- ulint info_len;
- ulint i;
- ulint comp;
- dict_index_t* index;
-
- ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
-
- data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
-
- ut_a(len == 1);
- ut_a(*data == 0);
- ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
-
- n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
tuple = dtuple_create(heap, n_fields);
- types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
-
- ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
-
- index = ibuf_dummy_index_create(n_fields, comp);
-
- len -= info_len;
- types += info_len;
-
- ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ index = dict_mem_index_create(
+ dict_table_t::create({C_STRING_WITH_LEN("")}, nullptr,
+ n_fields, 0,
+ not_redundant ? DICT_TF_COMPACT : 0, 0),
+ "IBUF_DUMMY", 0, n_fields);
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ ut_d(index->cached = true);
+ ut_d(index->is_dummy = true);
- for (i = 0; i < n_fields; i++) {
+ for (ulint i = 0; i < n_fields; i++) {
field = dtuple_get_nth_field(tuple, i);
data = rec_get_nth_field_old(
@@ -1384,8 +208,7 @@ ibuf_build_entry_from_ibuf_rec_func(
dfield_set_data(field, data, len);
dtype_new_read_for_order_and_null_size(
- dfield_get_type(field),
- types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ dfield_get_type(field), types + i * 6);
ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
}
@@ -1395,2136 +218,87 @@ ibuf_build_entry_from_ibuf_rec_func(
/* Prevent an ut_ad() failure in page_zip_write_rec() by
adding system columns to the dummy table pointed to by the
- dummy secondary index. The insert buffer is only used for
+ dummy secondary index. The change buffer was only used for
secondary indexes, whose records never contain any system
columns, such as DB_TRX_ID. */
ut_d(dict_table_add_system_columns(index->table, index->table->heap));
-
- *pindex = index;
-
- return(tuple);
-}
-
-/******************************************************************//**
-Get the data size.
-@return size of fields */
-UNIV_INLINE
-ulint
-ibuf_rec_get_size(
-/*==============*/
- const rec_t* rec, /*!< in: ibuf record */
- const byte* types, /*!< in: fields */
- ulint n_fields, /*!< in: number of fields */
- ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT,
- nonzero=ROW_FORMAT=COMPACT */
-{
- ulint i;
- ulint field_offset;
- ulint types_offset;
- ulint size = 0;
-
- field_offset = IBUF_REC_FIELD_USER;
- types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
-
- for (i = 0; i < n_fields; i++) {
- ulint len;
- dtype_t dtype;
-
- rec_get_nth_field_offs_old(rec, i + field_offset, &len);
-
- if (len != UNIV_SQL_NULL) {
- size += len;
- } else {
- dtype_new_read_for_order_and_null_size(&dtype, types);
-
- size += dtype_get_sql_null_size(&dtype, comp);
- }
-
- types += types_offset;
- }
-
- return(size);
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec)
-#else /* UNIV_DEBUG */
-# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec)
-#endif
-
-/********************************************************************//**
-Returns the space taken by a stored non-clustered index entry if converted to
-an index record.
-@return size of index record in bytes + an upper limit of the space
-taken in the page directory */
-static
-ulint
-ibuf_rec_get_volume_func(
-/*=====================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* ibuf_rec)/*!< in: ibuf record */
-{
- ulint len;
- const byte* data;
- const byte* types;
- ulint n_fields;
- ulint data_size;
- ulint comp;
- ibuf_op_t op;
- ulint info_len;
-
- ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
- ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
-
- data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
- ut_a(len == 1);
- ut_a(*data == 0);
-
- types = rec_get_nth_field_old(
- ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
-
- ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
-
- if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
- /* Delete-marking a record doesn't take any
- additional space, and while deleting a record
- actually frees up space, we have to play it safe and
- pretend it takes no additional space (the record
- might not exist, etc.). */
-
- return(0);
- } else if (comp) {
- dtuple_t* entry;
- ulint volume;
- dict_index_t* dummy_index;
- mem_heap_t* heap = mem_heap_create(500);
-
- entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
- heap, &dummy_index);
-
- volume = rec_get_converted_size(dummy_index, entry, 0);
-
- ibuf_dummy_index_free(dummy_index);
- mem_heap_free(heap);
-
- return(volume + page_dir_calc_reserved_space(1));
- }
-
- types += info_len;
- n_fields = rec_get_n_fields_old(ibuf_rec)
- - IBUF_REC_FIELD_USER;
-
- data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
-
- return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
- + page_dir_calc_reserved_space(1));
-}
-
-/*********************************************************************//**
-Builds the tuple to insert to an ibuf tree when we have an entry for a
-non-clustered index.
-
-NOTE that the original entry must be kept because we copy pointers to
-its fields.
-
-@return own: entry to insert into an ibuf index tree */
-static
-dtuple_t*
-ibuf_entry_build(
-/*=============*/
- ibuf_op_t op, /*!< in: operation type */
- dict_index_t* index, /*!< in: non-clustered index */
- const dtuple_t* entry, /*!< in: entry for a non-clustered index */
- ulint space, /*!< in: space id */
- ulint page_no,/*!< in: index page number where entry should
- be inserted */
- ulint counter,/*!< in: counter value;
- ULINT_UNDEFINED=not used */
- mem_heap_t* heap) /*!< in: heap into which to build */
-{
- dtuple_t* tuple;
- dfield_t* field;
- const dfield_t* entry_field;
- ulint n_fields;
- byte* buf;
- byte* ti;
- byte* type_info;
- ulint i;
-
- ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
- ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
- ut_ad(op < IBUF_OP_COUNT);
-
- /* We have to build a tuple with the following fields:
-
- 1-4) These are described at the top of this file.
-
- 5) The rest of the fields are copied from the entry.
-
- All fields in the tuple are ordered like the type binary in our
- insert buffer tree. */
-
- n_fields = dtuple_get_n_fields(entry);
-
- tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
-
- /* 1) Space Id */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
-
- mach_write_to_4(buf, space);
-
- dfield_set_data(field, buf, 4);
-
- /* 2) Marker byte */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
-
- /* We set the marker byte zero */
-
- mach_write_to_1(buf, 0);
-
- dfield_set_data(field, buf, 1);
-
- /* 3) Page number */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
-
- mach_write_to_4(buf, page_no);
-
- dfield_set_data(field, buf, 4);
-
- /* 4) Type info, part #1 */
-
- if (counter == ULINT_UNDEFINED) {
- i = dict_table_is_comp(index->table) ? 1 : 0;
- } else {
- ut_ad(counter <= 0xFFFF);
- i = IBUF_REC_INFO_SIZE;
- }
-
- ti = type_info = static_cast<byte*>(
- mem_heap_alloc(
- heap,
- i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
-
- switch (i) {
- default:
- ut_error;
- break;
- case 1:
- /* set the flag for ROW_FORMAT=COMPACT */
- *ti++ = 0;
- /* fall through */
- case 0:
- /* the old format does not allow delete buffering */
- ut_ad(op == IBUF_OP_INSERT);
- break;
- case IBUF_REC_INFO_SIZE:
- mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
-
- ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
- ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
- ? IBUF_REC_COMPACT : 0;
- ti += IBUF_REC_INFO_SIZE;
- break;
- }
-
- /* 5+) Fields from the entry */
-
- for (i = 0; i < n_fields; i++) {
- ulint fixed_len;
- const dict_field_t* ifield;
-
- field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
- entry_field = dtuple_get_nth_field(entry, i);
- dfield_copy(field, entry_field);
-
- ifield = dict_index_get_nth_field(index, i);
- ut_ad(!ifield->descending);
- /* Prefix index columns of fixed-length columns are of
- fixed length. However, in the function call below,
- dfield_get_type(entry_field) contains the fixed length
- of the column in the clustered index. Replace it with
- the fixed length of the secondary index column. */
- fixed_len = ifield->fixed_len;
-
-#ifdef UNIV_DEBUG
- if (fixed_len) {
- /* dict_index_add_col() should guarantee these */
- ut_ad(fixed_len <= (ulint)
- dfield_get_type(entry_field)->len);
- if (ifield->prefix_len) {
- ut_ad(ifield->prefix_len == fixed_len);
- } else {
- ut_ad(fixed_len == (ulint)
- dfield_get_type(entry_field)->len);
- }
- }
-#endif /* UNIV_DEBUG */
-
- dtype_new_store_for_order_and_null_size(
- ti, dfield_get_type(entry_field), fixed_len);
- ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
- }
-
- /* 4) Type info, part #2 */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
-
- dfield_set_data(field, type_info, ulint(ti - type_info));
-
- /* Set all the types in the new tuple binary */
-
- dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
-
- return(tuple);
-}
-
-/*********************************************************************//**
-Builds a search tuple used to search buffered inserts for an index page.
-This is for >= 4.1.x format records.
-@return own: search tuple */
-static
-dtuple_t*
-ibuf_search_tuple_build(
-/*====================*/
- ulint space, /*!< in: space id */
- ulint page_no,/*!< in: index page number */
- mem_heap_t* heap) /*!< in: heap into which to build */
-{
- dtuple_t* tuple;
- dfield_t* field;
- byte* buf;
-
- tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
-
- /* Store the space id in tuple */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
-
- mach_write_to_4(buf, space);
-
- dfield_set_data(field, buf, 4);
-
- /* Store the new format record marker byte */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
-
- mach_write_to_1(buf, 0);
-
- dfield_set_data(field, buf, 1);
-
- /* Store the page number in tuple */
-
- field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
-
- mach_write_to_4(buf, page_no);
-
- dfield_set_data(field, buf, 4);
-
- dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
-
return(tuple);
}
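
Callers of ibuf_entry_build() must first derive n_fields and the types pointer from the raw record, as the removed ibuf_build_entry_from_ibuf_rec_func() used to do: four system fields precede the user fields, and after the info prefix the metadata field holds one 6-byte descriptor per user field. A self-contained sketch of that arithmetic (names are illustrative):

    #include <cstddef>

    // Split the metadata field length into info prefix and type descriptors.
    // Returns the number of user fields, or 0 on an inconsistent length.
    static size_t split_metadata(size_t metadata_len, size_t n_rec_fields,
                                 size_t *info_len)
    {
      if (n_rec_fields < 4)               // IBUF_REC_FIELD_USER == 4
        return 0;
      const size_t n_user_fields = n_rec_fields - 4;
      *info_len = metadata_len % 6;       // 0, 1 or IBUF_REC_INFO_SIZE
      if (metadata_len - *info_len != n_user_fields * 6)
        return 0;                         // corrupted record
      return n_user_fields;
    }
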
-/*********************************************************************//**
-Checks if there are enough pages in the free list of the ibuf tree that we
-dare to start a pessimistic insert to the insert buffer.
-@return whether enough free pages in list */
-static inline bool ibuf_data_enough_free_for_insert()
-{
- mysql_mutex_assert_owner(&ibuf_mutex);
-
- /* We want a big margin of free pages, because a B-tree can sometimes
- grow in size even when records are deleted from it, as the node pointers
- can change, and we must make sure that we are able to remove the
- buffered changes for pages that we read into the buffer pool, without
- any risk of running out of free space in the insert buffer. */
-
- return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height);
-}
-
-/*********************************************************************//**
-Checks if there are enough pages in the free list of the ibuf tree that we
-should remove them and free to the file space management.
-@return TRUE if enough free pages in list */
-UNIV_INLINE
-ibool
-ibuf_data_too_much_free(void)
-/*=========================*/
-{
- mysql_mutex_assert_owner(&ibuf_mutex);
-
- return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
-}
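
The two thresholds differ only by the constant 3, which gives some hysteresis between allocating and freeing change buffer pages. As a worked example with illustrative numbers: if ibuf.size is 100 pages and the tree height is 3, pessimistic inserts are allowed once free_list_len >= 100/2 + 3*3 = 59, while pages are handed back to the tablespace only once free_list_len >= 3 + 59 = 62.
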
-
-/** Allocate a change buffer page.
-@retval true on success
-@retval false if no space left */
-static bool ibuf_add_free_page()
-{
- mtr_t mtr;
- page_t* header_page;
- buf_block_t* block;
-
- mtr.start();
- /* Acquire the fsp latch before the ibuf header, obeying the latching
- order */
- mtr.x_lock_space(fil_system.sys_space);
- header_page = ibuf_header_page_get(&mtr);
- if (!header_page) {
- mtr.commit();
- return false;
- }
-
- /* Allocate a new page: NOTE that if the page has been a part of a
- non-clustered index which has subsequently been dropped, then the
- page may have buffered inserts in the insert buffer, and these
- should be deleted from there. These get deleted when the page
- allocation creates the page in buffer. Thus the call below may end
- up calling the insert buffer routines and, as we do not yet hold any
- latches on insert buffer tree pages, these routines can run without a risk
- of a deadlock. This is the reason why we created a special ibuf
- header page apart from the ibuf tree. */
-
- dberr_t err;
- block = fseg_alloc_free_page_general(
- header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
- false, &mtr, &mtr, &err);
-
- if (!block) {
- mtr.commit();
- return false;
- }
-
- ut_ad(block->page.lock.not_recursive());
- ibuf_enter(&mtr);
- mysql_mutex_lock(&ibuf_mutex);
-
- mtr.write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
- FIL_PAGE_IBUF_FREE_LIST);
- buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr);
- if (UNIV_UNLIKELY(!ibuf_root)) {
-corrupted:
- /* Do not bother to try to free the allocated block, because
- the change buffer is seriously corrupted already. */
- mysql_mutex_unlock(&ibuf_mutex);
- ibuf_mtr_commit(&mtr);
- return false;
- }
-
- /* Add the page to the free list and update the ibuf size data */
-
- err = flst_add_last(ibuf_root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- fil_system.sys_space->free_limit, &mtr);
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- goto corrupted;
- }
-
- /* Set the bit indicating that this page is now an ibuf tree page
- (level 2 page) */
-
- const page_id_t page_id(block->page.id());
- buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
-
- if (UNIV_UNLIKELY(!bitmap_page)) {
- goto corrupted;
- }
-
- ibuf.seg_size++;
- ibuf.free_list_len++;
-
- mysql_mutex_unlock(&ibuf_mutex);
-
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
- srv_page_size, true, &mtr);
- ibuf_mtr_commit(&mtr);
- return true;
-}
-
-/*********************************************************************//**
-Removes a page from the free list and frees it to the fsp system. */
-static void ibuf_remove_free_page()
+/** Removes a page from the free list and frees it to the fsp system.
+@param mtr mini-transaction
+@return error code
+@retval DB_SUCCESS if more work may remain to be done
+@retval DB_SUCCESS_LOCKED_REC if everything was freed */
+ATTRIBUTE_COLD static dberr_t ibuf_remove_free_page(mtr_t &mtr)
{
- mtr_t mtr;
- page_t* header_page;
+ log_free_check();
- log_free_check();
-
- mtr_start(&mtr);
- /* Acquire the fsp latch before the ibuf header, obeying the latching
- order */
-
- mtr.x_lock_space(fil_system.sys_space);
- header_page = ibuf_header_page_get(&mtr);
-
- /* Prevent pessimistic inserts to insert buffer trees for a while */
- ibuf_enter(&mtr);
- mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
- mysql_mutex_lock(&ibuf_mutex);
-
- if (!header_page || !ibuf_data_too_much_free()) {
-early_exit:
- mysql_mutex_unlock(&ibuf_mutex);
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
-
- ibuf_mtr_commit(&mtr);
-
- return;
- }
-
- buf_block_t* root = ibuf_tree_root_get(&mtr);
-
- if (UNIV_UNLIKELY(!root)) {
- goto early_exit;
- }
-
- const auto root_savepoint = mtr.get_savepoint() - 1;
- const uint32_t page_no = flst_get_last(PAGE_HEADER
- + PAGE_BTR_IBUF_FREE_LIST
- + root->page.frame).page;
-
- if (page_no >= fil_system.sys_space->free_limit) {
- goto early_exit;
- }
-
- mysql_mutex_unlock(&ibuf_mutex);
-
- /* NOTE that we must release the latch on the ibuf tree root
- because in fseg_free_page we access level 1 pages, and the root
- is a level 2 page. */
- root->page.lock.u_unlock();
- mtr.lock_register(root_savepoint, MTR_MEMO_BUF_FIX);
- ibuf_exit(&mtr);
-
- /* Since pessimistic inserts were prevented, we know that the
- page is still in the free list. NOTE that also deletes may take
- pages from the free list, but they take them from the start, and
- the free list was so long that they cannot have taken the last
- page from it. */
-
- compile_time_assert(IBUF_SPACE_ID == 0);
- const page_id_t page_id{IBUF_SPACE_ID, page_no};
- buf_block_t* bitmap_page = nullptr;
- dberr_t err = fseg_free_page(
- header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
- fil_system.sys_space, page_no, &mtr);
-
- if (err != DB_SUCCESS) {
- goto func_exit;
- }
-
- ibuf_enter(&mtr);
-
- mysql_mutex_lock(&ibuf_mutex);
- mtr.upgrade_buffer_fix(root_savepoint, RW_X_LATCH);
-
- /* Remove the page from the free list and update the ibuf size data */
- if (buf_block_t* block =
- buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr, BUF_GET,
- &mtr, &err)) {
- err = flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block,
- PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- fil_system.sys_space->free_limit, &mtr);
- }
-
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+ mtr.start();
- if (err == DB_SUCCESS) {
- ibuf.seg_size--;
- ibuf.free_list_len--;
- bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
- }
+ mtr.x_lock_space(fil_system.sys_space);
+ dberr_t err;
+ buf_block_t* header= buf_page_get_gen(ibuf_header, 0, RW_X_LATCH, nullptr,
+ BUF_GET, &mtr, &err);
+ if (!header)
+ {
func_exit:
- mysql_mutex_unlock(&ibuf_mutex);
-
- if (bitmap_page) {
- /* Set the bit indicating that this page is no more an
- ibuf tree page (level 2 page) */
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
- bitmap_page, page_id, srv_page_size, false, &mtr);
- }
-
- if (err == DB_SUCCESS) {
- buf_page_free(fil_system.sys_space, page_no, &mtr);
- }
-
- ibuf_mtr_commit(&mtr);
-}
-
-/***********************************************************************//**
-Frees excess pages from the ibuf free list. This function is called when an OS
-thread calls fsp services to allocate a new file segment, or a new page for a
-file segment, and the thread did not already hold the fsp latch. */
-void
-ibuf_free_excess_pages(void)
-/*========================*/
-{
- if (UNIV_UNLIKELY(!ibuf.index)) return;
- /* Free at most a few pages at a time, so that we do not delay the
- requested service too much */
-
- for (ulint i = 0; i < 4; i++) {
-
- ibool too_much_free;
-
- mysql_mutex_lock(&ibuf_mutex);
- too_much_free = ibuf_data_too_much_free();
- mysql_mutex_unlock(&ibuf_mutex);
-
- if (!too_much_free) {
- return;
- }
-
- ibuf_remove_free_page();
- }
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \
- ibuf_get_merge_page_nos_func(rec,mtr,ids,pages,n_stored)
-#else /* UNIV_DEBUG */
-# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \
- ibuf_get_merge_page_nos_func(rec,ids,pages,n_stored)
-#endif /* UNIV_DEBUG */
-
-/*********************************************************************//**
-Reads page numbers from a leaf in an ibuf tree.
-@return a lower limit for the combined volume of records which will be
-merged */
-static
-ulint
-ibuf_get_merge_page_nos_func(
-/*=========================*/
- const rec_t* rec, /*!< in: insert buffer record */
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction holding rec */
-#endif /* UNIV_DEBUG */
- uint32_t* space_ids,/*!< in/out: space id's of the pages */
- uint32_t* page_nos,/*!< in/out: buffer for at least
- IBUF_MAX_N_PAGES_MERGED many page numbers;
- the page numbers are in an ascending order */
- ulint* n_stored)/*!< out: number of page numbers stored to
- page_nos in this function */
-{
- uint32_t prev_page_no;
- uint32_t prev_space_id;
- uint32_t first_page_no;
- uint32_t first_space_id;
- uint32_t rec_page_no;
- uint32_t rec_space_id;
- ulint sum_volumes;
- ulint volume_for_page;
- ulint rec_volume;
- ulint limit;
- ulint n_pages;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
-
- *n_stored = 0;
-
- if (page_rec_is_supremum(rec)) {
-
- rec = page_rec_get_prev_const(rec);
- if (UNIV_UNLIKELY(!rec)) {
-corruption:
- ut_ad("corrupted page" == 0);
- return 0;
- }
- }
-
- if (page_rec_is_infimum(rec)) {
- rec = page_rec_get_next_const(rec);
- if (!rec || page_rec_is_supremum(rec)) {
- return 0;
- }
- }
-
- limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
- buf_pool_get_curr_size() / 4);
-
- first_page_no = ibuf_rec_get_page_no(mtr, rec);
- first_space_id = ibuf_rec_get_space(mtr, rec);
- n_pages = 0;
- prev_page_no = 0;
- prev_space_id = 0;
-
- /* Go backwards from the first rec until we reach the border of the
- 'merge area', or the page start or the limit of storeable pages is
- reached */
-
- while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
-
- rec_page_no = ibuf_rec_get_page_no(mtr, rec);
- rec_space_id = ibuf_rec_get_space(mtr, rec);
-
- if (rec_space_id != first_space_id
- || (rec_page_no / IBUF_MERGE_AREA)
- != (first_page_no / IBUF_MERGE_AREA)) {
-
- break;
- }
-
- if (rec_page_no != prev_page_no
- || rec_space_id != prev_space_id) {
- n_pages++;
- }
-
- prev_page_no = rec_page_no;
- prev_space_id = rec_space_id;
-
- if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
- goto corruption;
- }
- }
-
- rec = page_rec_get_next_const(rec);
-
- /* At the loop start there is no prev page; we mark this with a pair
- of space id, page no (0, 0) for which there can never be entries in
- the insert buffer */
-
- prev_page_no = 0;
- prev_space_id = 0;
- sum_volumes = 0;
- volume_for_page = 0;
-
- while (*n_stored < limit && rec) {
- if (page_rec_is_supremum(rec)) {
- /* When no more records are available, mark this with
- another 'impossible' pair of space id, page no */
- rec_page_no = 1;
- rec_space_id = 0;
- } else {
- rec_page_no = ibuf_rec_get_page_no(mtr, rec);
- rec_space_id = ibuf_rec_get_space(mtr, rec);
- /* In the system tablespace the smallest
- possible secondary index leaf page number is
- bigger than FSP_DICT_HDR_PAGE_NO (7).
- In all tablespaces, pages 0 and 1 are reserved
- for the allocation bitmap and the change
- buffer bitmap. In file-per-table tablespaces,
- a file segment inode page will be created at
- page 2 and the clustered index tree is created
- at page 3. So for file-per-table tablespaces,
- page 4 is the smallest possible secondary
- index leaf page. CREATE TABLESPACE also initially
- uses pages 2 and 3 for the first created table,
- but that table may be dropped, allowing page 2
- to be reused for a secondary index leaf page.
- To keep this assertion simple, just
- make sure the page is >= 2. */
- ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO);
- }
-
-#ifdef UNIV_IBUF_DEBUG
- ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
-#endif
- if ((rec_space_id != prev_space_id
- || rec_page_no != prev_page_no)
- && (prev_space_id != 0 || prev_page_no != 0)) {
-
- space_ids[*n_stored] = prev_space_id;
- page_nos[*n_stored] = prev_page_no;
- (*n_stored)++;
- sum_volumes += volume_for_page;
-
- if (rec_space_id != first_space_id
- || rec_page_no / IBUF_MERGE_AREA
- != first_page_no / IBUF_MERGE_AREA) {
-
- break;
- }
-
- volume_for_page = 0;
- }
-
- if (rec_page_no == 1 && rec_space_id == 0) {
- /* Supremum record */
-
- break;
- }
-
- rec_volume = ibuf_rec_get_volume(mtr, rec);
-
- volume_for_page += rec_volume;
-
- prev_page_no = rec_page_no;
- prev_space_id = rec_space_id;
-
- rec = page_rec_get_next_const(rec);
- }
-
-#ifdef UNIV_IBUF_DEBUG
- ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
-#endif
-#if 0
- fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
- *n_stored, sum_volumes);
-#endif
- return(sum_volumes);
-}
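
The scan above only batches records whose pages fall into the same merge area as the first record, i.e. that share the value page_no / IBUF_MERGE_AREA. For illustration (the value 8 below is an assumption; IBUF_MERGE_AREA is defined elsewhere in this file):

    #include <cstdint>

    // Illustrative grouping, assuming IBUF_MERGE_AREA == 8:
    static bool same_merge_area(uint32_t a, uint32_t b) { return a / 8 == b / 8; }
    // same_merge_area(42, 47) == true, so page 47 joins the batch of page 42;
    // same_merge_area(42, 48) == false, so a record for page 48 ends the scan.
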
-
-/*******************************************************************//**
-Get the matching records for space id.
-@return current rec or NULL */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-const rec_t*
-ibuf_get_user_rec(
-/*===============*/
- btr_pcur_t* pcur, /*!< in: the current cursor */
- mtr_t* mtr) /*!< in: mini transaction */
-{
- do {
- const rec_t* rec = btr_pcur_get_rec(pcur);
-
- if (page_rec_is_user_rec(rec)) {
- return(rec);
- }
- } while (btr_pcur_move_to_next(pcur, mtr));
-
- return(NULL);
-}
-
-/*********************************************************************//**
-Reads page numbers for a space id from an ibuf tree.
-@return a lower limit for the combined volume of records which will be
-merged */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-ulint
-ibuf_get_merge_pages(
-/*=================*/
- btr_pcur_t* pcur, /*!< in/out: cursor */
- uint32_t space, /*!< in: space for which to merge */
- ulint limit, /*!< in: max page numbers to read */
- uint32_t* pages, /*!< out: pages read */
- uint32_t* spaces, /*!< out: spaces read */
- ulint* n_pages,/*!< out: number of pages read */
- mtr_t* mtr) /*!< in: mini transaction */
-{
- const rec_t* rec;
- ulint volume = 0;
-
- *n_pages = 0;
-
- while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
- && ibuf_rec_get_space(mtr, rec) == space
- && *n_pages < limit) {
-
- uint32_t page_no = ibuf_rec_get_page_no(mtr, rec);
-
- if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
- spaces[*n_pages] = space;
- pages[*n_pages] = page_no;
- ++*n_pages;
- }
-
- volume += ibuf_rec_get_volume(mtr, rec);
-
- btr_pcur_move_to_next(pcur, mtr);
- }
-
- return(volume);
-}
-
-/**
-Delete a change buffer record.
-@param[in] page_id page identifier
-@param[in,out] pcur persistent cursor positioned on the record
-@param[in] search_tuple search key for (space,page_no)
-@param[in,out] mtr mini-transaction
-@return whether mtr was committed (due to pessimistic operation) */
-static MY_ATTRIBUTE((warn_unused_result, nonnull))
-bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
- const dtuple_t* search_tuple, mtr_t* mtr);
-
-/** Delete the change buffer records for the given page id
-@param page_id page identifier */
-static void ibuf_delete_recs(const page_id_t page_id)
-{
- if (!ibuf.index || srv_read_only_mode)
- return;
- dfield_t dfield[IBUF_REC_FIELD_METADATA];
- dtuple_t tuple {0,IBUF_REC_FIELD_METADATA,IBUF_REC_FIELD_METADATA,
- dfield,0,nullptr
-#ifdef UNIV_DEBUG
- ,DATA_TUPLE_MAGIC_N
-#endif
- };
- byte space_id[4], page_no[4];
-
- mach_write_to_4(space_id, page_id.space());
- mach_write_to_4(page_no, page_id.page_no());
+ mtr.commit();
+ return err;
+ }
- dfield_set_data(&dfield[0], space_id, 4);
- dfield_set_data(&dfield[1], field_ref_zero, 1);
- dfield_set_data(&dfield[2], page_no, 4);
- dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
+ buf_block_t *root= buf_page_get_gen(ibuf_root, 0, RW_X_LATCH,
+ nullptr, BUF_GET, &mtr, &err);
- mtr_t mtr;
-loop:
- btr_pcur_t pcur;
- pcur.btr_cur.page_cur.index= ibuf.index;
- ibuf_mtr_start(&mtr);
- if (btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr))
- goto func_exit;
- if (!btr_pcur_is_on_user_rec(&pcur))
- {
- ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ if (UNIV_UNLIKELY(!root))
goto func_exit;
- }
- for (;;)
+ const uint32_t page_no= flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST +
+ root->page.frame).page;
+ if (page_no == FIL_NULL)
{
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
- const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
- if (ibuf_rec_get_space(&mtr, ibuf_rec) != page_id.space()
- || ibuf_rec_get_page_no(&mtr, ibuf_rec) != page_id.page_no())
- break;
- /* Delete the record from ibuf */
- if (ibuf_delete_rec(page_id, &pcur, &tuple, &mtr))
- {
- /* Deletion was pessimistic and mtr was committed:
- we start from the beginning again */
- ut_ad(mtr.has_committed());
- goto loop;
- }
-
- if (btr_pcur_is_after_last_on_page(&pcur))
- {
- ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
- goto loop;
- }
+ mtr.set_modified(*root);
+ fsp_init_file_page(fil_system.sys_space, root, &mtr);
+ err= DB_SUCCESS_LOCKED_REC;
+ goto func_exit;
}
-func_exit:
- ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
-}
-
-/** Merge the change buffer to some pages. */
-static void ibuf_read_merge_pages(const uint32_t* space_ids,
- const uint32_t* page_nos, ulint n_stored,
- bool slow_shutdown_cleanup)
-{
- for (ulint i = 0; i < n_stored; i++) {
- const uint32_t space_id = space_ids[i];
- fil_space_t* s = fil_space_t::get(space_id);
- if (!s) {
-tablespace_deleted:
- /* The tablespace was not found: remove all
- entries for it */
- ibuf_delete_for_discarded_space(space_id);
- while (i + 1 < n_stored
- && space_ids[i + 1] == space_id) {
- i++;
- }
- continue;
- }
-
- const ulint zip_size = s->zip_size(), size = s->size;
- s->x_lock();
- s->release();
- mtr_t mtr;
-
- if (UNIV_LIKELY(page_nos[i] < size)) {
- mtr.start();
- dberr_t err;
- /* Load the page and apply change buffers. */
- std::ignore =
- buf_page_get_gen(page_id_t(space_id, page_nos[i]),
- zip_size, RW_X_LATCH, nullptr,
- BUF_GET_POSSIBLY_FREED,
- &mtr, &err, true);
- mtr.commit();
- if (err == DB_TABLESPACE_DELETED) {
- s->x_unlock();
- goto tablespace_deleted;
- }
- }
-
- s->x_unlock();
-
- /* During slow shutdown cleanup, we apply all pending IBUF
- changes and need to clean up any left-over IBUF records. There
- are a few cases where the changes have already been discarded
- and the IBUF bitmap cleaned, but the record was not deleted.
- Even after those issues are fixed, we need to keep this safety
- measure for upgraded databases with such left-over records. */
- if (!slow_shutdown_cleanup) {
- continue;
- }
-
- /* The following code works around a hang when the
- change buffer is corrupted, likely due to the
- failure of ibuf_merge_or_delete_for_page() to
- invoke ibuf_delete_recs() if (!bitmap_bits).
-
- It also introduced corruption by itself in the
- following scenario:
-
- (1) We merged buffered changes in buf_page_get_gen()
- (2) We committed the mini-transaction
- (3) Redo log and the page with the merged changes is written
- (4) A write completion callback thread evicts the page.
- (5) Other threads buffer changes for that page.
- (6) We will wrongly discard those newly buffered changes below.
-
- To prevent this scenario, we will only invoke this code
- on shutdown. A call to ibuf_max_size_update(0) will cause
- ibuf_insert_low() to refuse to insert anything into the
- change buffer. */
-
- /* Prevent an infinite loop, by removing entries from
- the change buffer in the case the bitmap bits were
- wrongly clear even though buffered changes exist. */
- ibuf_delete_recs(page_id_t(space_id, page_nos[i]));
- }
-}
-
-/** Contract the change buffer by reading pages to the buffer pool.
-@return a lower limit for the combined size in bytes of entries which
-will be merged from ibuf trees to the pages read
-@retval 0 if ibuf.empty */
-ATTRIBUTE_COLD ulint ibuf_contract()
-{
- if (UNIV_UNLIKELY(!ibuf.index)) return 0;
- mtr_t mtr;
- btr_cur_t cur;
- ulint sum_sizes;
- uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
- uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
-
- ibuf_mtr_start(&mtr);
-
- if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) !=
- DB_SUCCESS) {
- return 0;
- }
-
- ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index));
-
- if (page_is_empty(btr_cur_get_page(&cur))) {
- /* If a B-tree page is empty, it must be the root page
- and the whole B-tree must be empty. InnoDB does not
- allow empty B-tree pages other than the root. */
- ut_ad(ibuf.empty);
- ut_ad(btr_cur_get_block(&cur)->page.id()
- == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
-
- ibuf_mtr_commit(&mtr);
-
- return(0);
- }
-
- ulint n_pages = 0;
- sum_sizes = ibuf_get_merge_page_nos(btr_cur_get_rec(&cur), &mtr,
- space_ids, page_nos, &n_pages);
- ibuf_mtr_commit(&mtr);
-
- ibuf_read_merge_pages(space_ids, page_nos, n_pages, true);
-
- return(sum_sizes + 1);
-}
-
-/*********************************************************************//**
-Contracts insert buffer trees by reading pages referring to space_id
-to the buffer pool.
-@returns number of pages merged.*/
-ulint
-ibuf_merge_space(
-/*=============*/
- ulint space) /*!< in: tablespace id to merge */
-{
- if (UNIV_UNLIKELY(!ibuf.index)) return 0;
- mtr_t mtr;
- btr_pcur_t pcur;
-
- dfield_t dfield[IBUF_REC_FIELD_METADATA];
- dtuple_t tuple {0, IBUF_REC_FIELD_METADATA,
- IBUF_REC_FIELD_METADATA,dfield,0,nullptr
-#ifdef UNIV_DEBUG
- , DATA_TUPLE_MAGIC_N
-#endif
- };
- byte space_id[4];
-
- mach_write_to_4(space_id, space);
-
- dfield_set_data(&dfield[0], space_id, 4);
- dfield_set_data(&dfield[1], field_ref_zero, 1);
- dfield_set_data(&dfield[2], field_ref_zero, 4);
-
- dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
- ulint n_pages = 0;
-
- ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
-
- log_free_check();
- ibuf_mtr_start(&mtr);
-
- /* Position the cursor on the first matching record. */
-
- pcur.btr_cur.page_cur.index = ibuf.index;
- dberr_t err = btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
- &pcur, &mtr);
- ut_ad(err != DB_SUCCESS || page_validate(btr_pcur_get_page(&pcur),
- ibuf.index));
-
- ulint sum_sizes = 0;
- uint32_t pages[IBUF_MAX_N_PAGES_MERGED];
- uint32_t spaces[IBUF_MAX_N_PAGES_MERGED];
-
- if (err != DB_SUCCESS) {
- } else if (page_is_empty(btr_pcur_get_page(&pcur))) {
- /* If a B-tree page is empty, it must be the root page
- and the whole B-tree must be empty. InnoDB does not
- allow empty B-tree pages other than the root. */
- ut_ad(ibuf.empty);
- ut_ad(btr_pcur_get_block(&pcur)->page.id()
- == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
- } else {
-
- sum_sizes = ibuf_get_merge_pages(
- &pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
- &pages[0], &spaces[0], &n_pages,
- &mtr);
- ib::info() << "Size of pages merged " << sum_sizes;
- }
-
- ibuf_mtr_commit(&mtr);
-
- if (n_pages > 0) {
- ut_ad(n_pages <= UT_ARR_SIZE(pages));
-
-#ifdef UNIV_DEBUG
- for (ulint i = 0; i < n_pages; ++i) {
- ut_ad(spaces[i] == space);
- }
-#endif /* UNIV_DEBUG */
-
- ibuf_read_merge_pages(spaces, pages, n_pages, false);
- }
-
- return(n_pages);
-}
-
-/** Determine if a change buffer record has been encountered already.
-@param rec change buffer record in the MySQL 5.5 format
-@param hash hash table of encountered records
-@param size number of elements in hash
-@retval true if a distinct record
-@retval false if this may be duplicating an earlier record */
-static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash,
- ulint size)
-{
- ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER);
- const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER);
- const ulint len= rec_get_data_size_old(rec) - start;
- const uint32_t fold= my_crc32c(0, rec + start, len);
- hash+= (fold / (CHAR_BIT * sizeof *hash)) % size;
- ulint bitmask= static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash)));
-
- if (*hash & bitmask)
- return false;
-
- /* We have not seen this record yet. Remember it. */
- *hash|= bitmask;
- return true;
-}
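
ibuf_get_volume_buffered_hash() is effectively a single-hash Bloom filter over the user-data portion of each record: one CRC-32C fold picks a word and a bit in the caller-supplied bitmap, and a record counts as distinct only if its bit was still clear. A standalone sketch of the same idea, with std::hash standing in for my_crc32c() and a string_view for the record bytes:

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <string_view>

    // One-bit-per-hash "seen before?" filter, mirroring the logic above.
    static bool mark_if_unseen(std::string_view user_fields,
                               uintptr_t *bitmap, size_t n_words)
    {
      const size_t fold = std::hash<std::string_view>{}(user_fields);
      const size_t bits_per_word = 8 * sizeof *bitmap;
      uintptr_t *word = bitmap + (fold / bits_per_word) % n_words;
      const uintptr_t mask = uintptr_t(1) << (fold % bits_per_word);
      if (*word & mask)
        return false;   // may duplicate an earlier record: do not count it
      *word |= mask;    // remember this record
      return true;      // definitely not seen before
    }
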
-
-#ifdef UNIV_DEBUG
-# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
- ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs)
-#else /* UNIV_DEBUG */
-# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
- ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs)
-#endif /* UNIV_DEBUG */
-
-/*********************************************************************//**
-Update the estimate of the number of records on a page, and
-get the space taken by merging the buffered record to the index page.
-@return size of index record in bytes + an upper limit of the space
-taken in the page directory */
-static
-ulint
-ibuf_get_volume_buffered_count_func(
-/*================================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction owning rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec, /*!< in: insert buffer record */
- ulint* hash, /*!< in/out: hash array */
- ulint size, /*!< in: number of elements in hash array */
- lint* n_recs) /*!< in/out: estimated number of records
- on the page that rec points to */
-{
- ulint len;
- ibuf_op_t ibuf_op;
- const byte* types;
- ulint n_fields;
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(ibuf_inside(mtr));
-
- n_fields = rec_get_n_fields_old(rec);
- ut_ad(n_fields > IBUF_REC_FIELD_USER);
- n_fields -= IBUF_REC_FIELD_USER;
-
- rec_get_nth_field_offs_old(rec, 1, &len);
- /* This function is only invoked when buffering new
- operations. All pre-4.1 records should have been merged
- when the database was started up. */
- ut_a(len == 1);
-
- if (rec_get_deleted_flag(rec, 0)) {
- /* This record has been merged already,
- but apparently the system crashed before
- the change was discarded from the buffer.
- Pretend that the record does not exist. */
- return(0);
- }
-
- types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
-
- switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE),
- IBUF_REC_INFO_SIZE)) {
- default:
- ut_error;
- case 0:
- /* This ROW_FORMAT=REDUNDANT record does not include an
- operation counter. Exclude it from the *n_recs,
- because deletes cannot be buffered if there are
- old-style inserts buffered for the page. */
-
- len = ibuf_rec_get_size(rec, types, n_fields, 0);
-
- return(len
- + rec_get_converted_extra_size(len, n_fields, 0)
- + page_dir_calc_reserved_space(1));
- case 1:
- /* This ROW_FORMAT=COMPACT record does not include an
- operation counter. Exclude it from the *n_recs,
- because deletes cannot be buffered if there are
- old-style inserts buffered for the page. */
- goto get_volume_comp;
-
- case IBUF_REC_INFO_SIZE:
- ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
- break;
- }
-
- switch (ibuf_op) {
- case IBUF_OP_INSERT:
- /* Inserts can be done by updating a delete-marked record.
- Because delete-mark and insert operations can be pointing to
- the same records, we must not count duplicates. */
- case IBUF_OP_DELETE_MARK:
- /* There must be a record to delete-mark.
- See if this record has been already buffered. */
- if (n_recs && ibuf_get_volume_buffered_hash(rec, hash, size)) {
- (*n_recs)++;
- }
-
- if (ibuf_op == IBUF_OP_DELETE_MARK) {
- /* Setting the delete-mark flag does not
- affect the available space on the page. */
- return(0);
- }
- break;
- case IBUF_OP_DELETE:
- /* A record will be removed from the page. */
- if (n_recs) {
- (*n_recs)--;
- }
- /* While deleting a record actually frees up space,
- we have to play it safe and pretend that it takes no
- additional space (the record might not exist, etc.). */
- return(0);
- default:
- ut_error;
- }
- ut_ad(ibuf_op == IBUF_OP_INSERT);
+ if (page_no >= fil_system.sys_space->free_limit)
+ goto corrupted;
-get_volume_comp:
- {
- dtuple_t* entry;
- ulint volume;
- dict_index_t* dummy_index;
- mem_heap_t* heap = mem_heap_create(500);
+ /* Since pessimistic inserts were prevented, we know that the
+ page is still in the free list. NOTE that also deletes may take
+ pages from the free list, but they take them from the start, and
+ the free list was so long that they cannot have taken the last
+ page from it. */
- entry = ibuf_build_entry_from_ibuf_rec(
- mtr, rec, heap, &dummy_index);
+ err= fseg_free_page(header->page.frame + PAGE_DATA, fil_system.sys_space,
+ page_no, &mtr);
- volume = rec_get_converted_size(dummy_index, entry, 0);
-
- ibuf_dummy_index_free(dummy_index);
- mem_heap_free(heap);
-
- return(volume + page_dir_calc_reserved_space(1));
- }
-}
-
-/*********************************************************************//**
-Gets an upper limit for the combined size of entries buffered in the insert
-buffer for a given page.
-@return upper limit for the volume of buffered inserts for the index
-page, in bytes; srv_page_size, if the entries for the index page span
-several pages in the insert buffer */
-static
-ulint
-ibuf_get_volume_buffered(
-/*=====================*/
- const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an
- insert buffer tree where we would insert an
- entry for the index page whose number is
- page_no, latch mode has to be BTR_MODIFY_PREV
- or BTR_MODIFY_TREE */
- ulint space, /*!< in: space id */
- ulint page_no,/*!< in: page number of an index page */
- lint* n_recs, /*!< in/out: minimum number of records on the
- page after the buffered changes have been
- applied, or NULL to disable the counting */
- mtr_t* mtr) /*!< in: mini-transaction of pcur */
-{
- ulint volume;
- const rec_t* rec;
- const page_t* page;
- const page_t* prev_page;
- const page_t* next_page;
- /* bitmap of buffered recs */
- ulint hash_bitmap[128 / sizeof(ulint)];
-
- ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
- || (pcur->latch_mode == BTR_MODIFY_TREE));
-
- /* Count the volume of inserts earlier in the alphabetical order than
- pcur */
-
- volume = 0;
-
- if (n_recs) {
- memset(hash_bitmap, 0, sizeof hash_bitmap);
- }
-
- rec = btr_pcur_get_rec(pcur);
- page = page_align(rec);
- ut_ad(page_validate(page, ibuf.index));
-
- if (page_rec_is_supremum(rec)
- && UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
-corruption:
- ut_ad("corrupted page" == 0);
- return srv_page_size;
- }
-
- uint32_t prev_page_no;
-
- for (; !page_rec_is_infimum(rec); ) {
- ut_ad(page_align(rec) == page);
-
- if (page_no != ibuf_rec_get_page_no(mtr, rec)
- || space != ibuf_rec_get_space(mtr, rec)) {
-
- goto count_later;
- }
-
- volume += ibuf_get_volume_buffered_count(
- mtr, rec,
- hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
-
- if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
- goto corruption;
- }
- }
-
- /* Look at the previous page */
-
- prev_page_no = btr_page_get_prev(page);
-
- if (prev_page_no == FIL_NULL) {
-
- goto count_later;
- }
-
- if (buf_block_t* block =
- buf_page_get(page_id_t(IBUF_SPACE_ID, prev_page_no),
- 0, RW_X_LATCH, mtr)) {
- prev_page = buf_block_get_frame(block);
- ut_ad(page_validate(prev_page, ibuf.index));
- } else {
- return srv_page_size;
- }
-
- static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
- static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
-
- if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT,
- page + FIL_PAGE_OFFSET, 4))) {
- return srv_page_size;
- }
-
- rec = page_rec_get_prev_const(page_get_supremum_rec(prev_page));
-
- if (UNIV_UNLIKELY(!rec)) {
- goto corruption;
- }
-
- for (;;) {
- ut_ad(page_align(rec) == prev_page);
-
- if (page_rec_is_infimum(rec)) {
-
- /* We cannot go to yet a previous page, because we
- do not have the x-latch on it, and cannot acquire one
- because of the latching order: we have to give up */
-
- return(srv_page_size);
- }
-
- if (page_no != ibuf_rec_get_page_no(mtr, rec)
- || space != ibuf_rec_get_space(mtr, rec)) {
-
- goto count_later;
- }
-
- volume += ibuf_get_volume_buffered_count(
- mtr, rec,
- hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
-
- if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
- goto corruption;
- }
- }
-
-count_later:
- rec = btr_pcur_get_rec(pcur);
-
- if (!page_rec_is_supremum(rec)) {
- rec = page_rec_get_next_const(rec);
- }
-
- for (; !page_rec_is_supremum(rec);
- rec = page_rec_get_next_const(rec)) {
- if (UNIV_UNLIKELY(!rec)) {
- return srv_page_size;
- }
- if (page_no != ibuf_rec_get_page_no(mtr, rec)
- || space != ibuf_rec_get_space(mtr, rec)) {
-
- return(volume);
- }
-
- volume += ibuf_get_volume_buffered_count(
- mtr, rec,
- hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
- }
-
- /* Look at the next page */
-
- uint32_t next_page_no = btr_page_get_next(page);
-
- if (next_page_no == FIL_NULL) {
-
- return(volume);
- }
-
- if (buf_block_t* block =
- buf_page_get(page_id_t(IBUF_SPACE_ID, next_page_no),
- 0, RW_X_LATCH, mtr)) {
- next_page = buf_block_get_frame(block);
- ut_ad(page_validate(next_page, ibuf.index));
- } else {
- return srv_page_size;
- }
-
- static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
- static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
-
- if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
- page + FIL_PAGE_OFFSET, 4))) {
- return 0;
- }
-
- rec = page_get_infimum_rec(next_page);
- rec = page_rec_get_next_const(rec);
-
- for (; ; rec = page_rec_get_next_const(rec)) {
- if (!rec || page_rec_is_supremum(rec)) {
- /* We give up */
- return(srv_page_size);
- }
-
- ut_ad(page_align(rec) == next_page);
-
- if (page_no != ibuf_rec_get_page_no(mtr, rec)
- || space != ibuf_rec_get_space(mtr, rec)) {
-
- return(volume);
- }
-
- volume += ibuf_get_volume_buffered_count(
- mtr, rec,
- hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
- }
-}
-
-/*********************************************************************//**
-Reads the biggest tablespace id from the high end of the insert buffer
-tree and updates the counter in fil_system. */
-void
-ibuf_update_max_tablespace_id(void)
-/*===============================*/
-{
- if (UNIV_UNLIKELY(!ibuf.index)) return;
- const rec_t* rec;
- const byte* field;
- ulint len;
- btr_pcur_t pcur;
- mtr_t mtr;
-
- ut_ad(!ibuf.index->table->not_redundant());
-
- ibuf_mtr_start(&mtr);
-
- if (pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr)
- != DB_SUCCESS) {
-func_exit:
- ibuf_mtr_commit(&mtr);
- return;
- }
-
- ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
-
- if (!btr_pcur_move_to_prev(&pcur, &mtr)
- || btr_pcur_is_before_first_on_page(&pcur)) {
- goto func_exit;
- }
-
- rec = btr_pcur_get_rec(&pcur);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
-
- ut_a(len == 4);
-
- const uint32_t max_space_id = mach_read_from_4(field);
-
- ibuf_mtr_commit(&mtr);
-
- /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
-
- fil_set_max_space_id_if_bigger(max_space_id);
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
- ibuf_get_entry_counter_low_func(mtr,rec,space,page_no)
-#else /* UNIV_DEBUG */
-# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
- ibuf_get_entry_counter_low_func(rec,space,page_no)
-#endif
-/****************************************************************//**
-Helper function for ibuf_get_entry_counter_func. Checks if rec is for
-(space, page_no), and if so, reads counter value from it and returns
-that + 1.
-@retval ULINT_UNDEFINED if the record does not contain any counter
-@retval 0 if the record is not for (space, page_no)
-@retval 1 + previous counter value, otherwise */
-static
-ulint
-ibuf_get_entry_counter_low_func(
-/*============================*/
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction of rec */
-#endif /* UNIV_DEBUG */
- const rec_t* rec, /*!< in: insert buffer record */
- ulint space, /*!< in: space id */
- ulint page_no) /*!< in: page number */
-{
- ulint counter;
- const byte* field;
- ulint len;
-
- ut_ad(ibuf_inside(mtr));
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_S_FIX));
- ut_ad(rec_get_n_fields_old(rec) > 2);
-
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
-
- ut_a(len == 1);
-
- /* Check the tablespace identifier. */
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
-
- ut_a(len == 4);
-
- if (mach_read_from_4(field) != space) {
-
- return(0);
- }
-
- /* Check the page offset. */
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
- ut_a(len == 4);
-
- if (mach_read_from_4(field) != page_no) {
-
- return(0);
- }
-
- /* Check if the record contains a counter field. */
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
-
- switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
- default:
- ut_error;
- case 0: /* ROW_FORMAT=REDUNDANT */
- case 1: /* ROW_FORMAT=COMPACT */
- return(ULINT_UNDEFINED);
-
- case IBUF_REC_INFO_SIZE:
- counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER);
- ut_a(counter < 0xFFFF);
- return(counter + 1);
- }
-}
-
-#ifdef UNIV_DEBUG
-# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
- ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf)
-#else /* UNIV_DEBUG */
-# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
- ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf)
-#endif /* UNIV_DEBUG */
-
-/****************************************************************//**
-Calculate the counter field for an entry based on the current
-last record in ibuf for (space, page_no).
-@return the counter field, or ULINT_UNDEFINED
-if we should abort this insertion to ibuf */
-static
-ulint
-ibuf_get_entry_counter_func(
-/*========================*/
- ulint space, /*!< in: space id of entry */
- ulint page_no, /*!< in: page number of entry */
- const rec_t* rec, /*!< in: the record preceding the
- insertion point */
-#ifdef UNIV_DEBUG
- mtr_t* mtr, /*!< in: mini-transaction */
-#endif /* UNIV_DEBUG */
- ibool only_leaf) /*!< in: TRUE if this is the only
- leaf page that can contain entries
- for (space,page_no), that is, there
- was no exact match for (space,page_no)
- in the node pointer */
-{
- ut_ad(ibuf_inside(mtr));
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
- ut_ad(page_validate(page_align(rec), ibuf.index));
-
- if (page_rec_is_supremum(rec)) {
- /* This is just for safety. The record should be a
- page infimum or a user record. */
- ut_ad(0);
- return(ULINT_UNDEFINED);
- } else if (!page_rec_is_infimum(rec)) {
- return(ibuf_get_entry_counter_low(mtr, rec, space, page_no));
- } else if (only_leaf || !page_has_prev(page_align(rec))) {
- /* The parent node pointer did not contain the
- searched for (space, page_no), which means that the
- search ended on the correct page regardless of the
- counter value, and since we're at the infimum record,
- there are no existing records. */
- return(0);
- } else {
- /* We used to read the previous page here. It would
- break the latching order, because the caller has
- buffer-fixed an insert buffer bitmap page. */
- return(ULINT_UNDEFINED);
- }
-}
-
-
-/** Translates the ibuf free bits to the free space on a page in bytes.
-@param[in] physical_size page_size
-@param[in] bits value for ibuf bitmap bits
-@return maximum insert size after reorganize for the page */
-inline ulint
-ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits)
-{
- ut_ad(bits < 4);
- ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-
- if (bits == 3) {
- bits = 4;
- }
-
- return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE;
-}
-
-/** Buffer an operation in the insert/delete buffer, instead of doing it
-directly to the disk page, if this is possible.
-@param[in] mode BTR_MODIFY_PREV or BTR_INSERT_TREE
-@param[in] op operation type
-@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete
-buffering
-@param[in] entry index entry to insert
-@param[in] entry_size rec_get_converted_size(index, entry)
-@param[in,out] index index where to insert; must not be unique
-or clustered
-@param[in] page_id page id where to insert
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] thr query thread
-@return DB_SUCCESS, DB_STRONG_FAIL or other error */
-static TRANSACTIONAL_TARGET MY_ATTRIBUTE((warn_unused_result))
-dberr_t
-ibuf_insert_low(
- btr_latch_mode mode,
- ibuf_op_t op,
- ibool no_counter,
- const dtuple_t* entry,
- ulint entry_size,
- dict_index_t* index,
- const page_id_t page_id,
- ulint zip_size,
- que_thr_t* thr)
-{
- big_rec_t* dummy_big_rec;
- btr_pcur_t pcur;
- btr_cur_t* cursor;
- dtuple_t* ibuf_entry;
- mem_heap_t* offsets_heap = NULL;
- mem_heap_t* heap;
- rec_offs* offsets = NULL;
- ulint buffered;
- lint min_n_recs;
- rec_t* ins_rec;
- buf_block_t* bitmap_page;
- buf_block_t* block = NULL;
- page_t* root;
- dberr_t err;
- mtr_t mtr;
- mtr_t bitmap_mtr;
-
- ut_a(!dict_index_is_clust(index));
- ut_ad(!dict_index_is_spatial(index));
- ut_ad(dtuple_check_typed(entry));
- ut_ad(!no_counter || op == IBUF_OP_INSERT);
- ut_ad(page_id.space() == index->table->space_id);
- ut_a(op < IBUF_OP_COUNT);
-
- /* Perform dirty comparison of ibuf.max_size and ibuf.size to
- reduce ibuf_mutex contention. */
- if (ibuf.size >= ibuf.max_size) {
- return(DB_STRONG_FAIL);
- }
-
- heap = mem_heap_create(1024);
-
- /* Build the entry which contains the space id and the page number
- as the first fields and the type information for other fields, and
- which will be inserted to the insert buffer. Using a counter value
- of 0xFFFF we find the last record for (space, page_no), from which
- we can then read the counter value N and use N + 1 in the record we
- insert. (We patch the ibuf_entry's counter field to the correct
- value just before actually inserting the entry.) */
-
- ibuf_entry = ibuf_entry_build(
- op, index, entry, page_id.space(), page_id.page_no(),
- no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
-
- /* Open a cursor to the insert buffer tree to calculate if we can add
- the new entry to it without exceeding the free space limit for the
- page. */
-
- if (mode == BTR_INSERT_TREE) {
- for (;;) {
- mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
- mysql_mutex_lock(&ibuf_mutex);
-
- if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
-
- break;
- }
-
- mysql_mutex_unlock(&ibuf_mutex);
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
-
- if (!ibuf_add_free_page()) {
-
- mem_heap_free(heap);
- return(DB_STRONG_FAIL);
- }
- }
- }
-
- ibuf_mtr_start(&mtr);
- pcur.btr_cur.page_cur.index = ibuf.index;
-
- err = btr_pcur_open(ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
- if (err != DB_SUCCESS) {
-func_exit:
- ibuf_mtr_commit(&mtr);
- ut_free(pcur.old_rec_buf);
- mem_heap_free(heap);
- return err;
- }
-
- ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
-
- /* Find out the volume of already buffered inserts for the same index
- page */
- min_n_recs = 0;
- buffered = ibuf_get_volume_buffered(&pcur,
- page_id.space(),
- page_id.page_no(),
- op == IBUF_OP_DELETE
- ? &min_n_recs
- : NULL, &mtr);
-
- const ulint physical_size = zip_size ? zip_size : srv_page_size;
-
- if (op == IBUF_OP_DELETE
- && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) {
- /* The page could become empty after the record is
- deleted, or the page has been read in to the buffer
- pool. Refuse to buffer the operation. */
-
- /* The buffer pool watch is needed for IBUF_OP_DELETE
- because of latching order considerations. We can
- check buf_pool_watch_occurred() only after latching
- the insert buffer B-tree pages that contain buffered
- changes for the page. We never buffer IBUF_OP_DELETE,
- unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
- been previously buffered for the page. Because there
- are buffered operations for the page, the insert
- buffer B-tree page latches held by mtr will guarantee
- that no changes for the user page will be merged
- before mtr_commit(&mtr). We must not mtr_commit(&mtr)
- until after the IBUF_OP_DELETE has been buffered. */
-
-fail_exit:
- if (mode == BTR_INSERT_TREE) {
- mysql_mutex_unlock(&ibuf_mutex);
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
- }
-
- err = DB_STRONG_FAIL;
- goto func_exit;
- }
-
- /* After this point, the page could still be loaded to the
- buffer pool, but we do not have to care about it, since we are
- holding a latch on the insert buffer leaf page that contains
- buffered changes for (space, page_no). If the page enters the
- buffer pool, buf_page_t::read_complete() for (space, page_no) will
- have to acquire a latch on the same insert buffer leaf page,
- which it cannot do until we have buffered the IBUF_OP_DELETE
- and done mtr_commit(&mtr) to release the latch. */
-
- ibuf_mtr_start(&bitmap_mtr);
-
- bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr);
-
- /* We check if the index page is suitable for buffered entries */
-
- if (!bitmap_page || buf_pool.page_hash_contains(
- page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
-commit_exit:
- ibuf_mtr_commit(&bitmap_mtr);
- goto fail_exit;
- } else if (!lock_sys.rd_lock_try()) {
- goto commit_exit;
- } else {
- hash_cell_t* cell = lock_sys.rec_hash.cell_get(page_id.fold());
- lock_sys.rec_hash.latch(cell)->acquire();
- const lock_t* lock = lock_sys_t::get_first(*cell, page_id);
- lock_sys.rec_hash.latch(cell)->release();
- lock_sys.rd_unlock();
- if (lock) {
- goto commit_exit;
- }
- }
-
- if (op == IBUF_OP_INSERT) {
- ulint bits = ibuf_bitmap_page_get_bits(
- bitmap_page->page.frame, page_id, physical_size,
- IBUF_BITMAP_FREE, &bitmap_mtr);
-
- if (buffered + entry_size + page_dir_calc_reserved_space(1)
- > ibuf_index_page_calc_free_from_bits(physical_size,
- bits)) {
- /* Release the bitmap page latch early. */
- ibuf_mtr_commit(&bitmap_mtr);
- goto fail_exit;
- }
- }
-
- if (!no_counter) {
- /* Patch correct counter value to the entry to
- insert. This can change the insert position, which can
- result in the need to abort in some cases. */
- ulint counter = ibuf_get_entry_counter(
- page_id.space(), page_id.page_no(),
- btr_pcur_get_rec(&pcur), &mtr,
- btr_pcur_get_btr_cur(&pcur)->low_match
- < IBUF_REC_FIELD_METADATA);
- dfield_t* field;
-
- if (counter == ULINT_UNDEFINED) {
- goto commit_exit;
- }
-
- field = dtuple_get_nth_field(
- ibuf_entry, IBUF_REC_FIELD_METADATA);
- mach_write_to_2(
- (byte*) dfield_get_data(field)
- + IBUF_REC_OFFSET_COUNTER, counter);
- }
-
- /* Set the bitmap bit denoting that the insert buffer contains
- buffered entries for this index page, if the bit is not set yet */
- index->set_modified(bitmap_mtr);
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
- bitmap_page, page_id, physical_size, true, &bitmap_mtr);
- ibuf_mtr_commit(&bitmap_mtr);
-
- cursor = btr_pcur_get_btr_cur(&pcur);
-
- if (mode == BTR_MODIFY_PREV) {
- err = btr_cur_optimistic_insert(
- BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
- cursor, &offsets, &offsets_heap,
- ibuf_entry, &ins_rec,
- &dummy_big_rec, 0, thr, &mtr);
- block = btr_cur_get_block(cursor);
- ut_ad(block->page.id().space() == IBUF_SPACE_ID);
-
- /* If this is the root page, update ibuf.empty. */
- if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
- const page_t* root = buf_block_get_frame(block);
-
- ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
- ut_ad(page_get_page_no(root)
- == FSP_IBUF_TREE_ROOT_PAGE_NO);
-
- ibuf.empty = page_is_empty(root);
- }
- } else {
- ut_ad(mode == BTR_INSERT_TREE);
-
- /* We acquire an sx-latch to the root page before the insert,
- because a pessimistic insert releases the tree x-latch,
- which would cause the sx-latching of the root after that to
- break the latching order. */
- if (buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr)) {
- root = ibuf_root->page.frame;
- } else {
- err = DB_CORRUPTION;
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
- mysql_mutex_unlock(&ibuf_mutex);
- goto ibuf_insert_done;
- }
-
- err = btr_cur_optimistic_insert(
- BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
- cursor, &offsets, &offsets_heap,
- ibuf_entry, &ins_rec,
- &dummy_big_rec, 0, thr, &mtr);
-
- if (err == DB_FAIL) {
- err = btr_cur_pessimistic_insert(
- BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
- cursor, &offsets, &offsets_heap,
- ibuf_entry, &ins_rec,
- &dummy_big_rec, 0, thr, &mtr);
- }
-
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
- ibuf_size_update(root);
- mysql_mutex_unlock(&ibuf_mutex);
- ibuf.empty = page_is_empty(root);
-
- block = btr_cur_get_block(cursor);
- ut_ad(block->page.id().space() == IBUF_SPACE_ID);
- }
-
-ibuf_insert_done:
- if (offsets_heap) {
- mem_heap_free(offsets_heap);
- }
-
- if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
- /* Update the page max trx id field */
- page_update_max_trx_id(block, NULL,
- thr_get_trx(thr)->id, &mtr);
- }
-
- goto func_exit;
-}
-
-/** Buffer an operation in the change buffer, instead of applying it
-directly to the file page, if this is possible. Does not do it if the index
-is clustered or unique.
-@param[in] op operation type
-@param[in] entry index entry to insert
-@param[in,out] index index where to insert
-@param[in] page_id page id where to insert
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] thr query thread
-@return true if success */
-TRANSACTIONAL_TARGET
-bool
-ibuf_insert(
- ibuf_op_t op,
- const dtuple_t* entry,
- dict_index_t* index,
- const page_id_t page_id,
- ulint zip_size,
- que_thr_t* thr)
-{
- if (!index->is_committed()) {
- return false;
- }
-
- dberr_t err;
- ulint entry_size;
- ibool no_counter;
- /* Read the settable global variable only once in
- this function, so that we will have a consistent view of it. */
- ibuf_use_t use = ibuf_use_t(innodb_change_buffering);
- DBUG_ENTER("ibuf_insert");
-
- DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF,
- op, page_id.space(), page_id.page_no()));
-
- ut_ad(dtuple_check_typed(entry));
- ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
- ut_ad(index->is_btree());
- ut_a(!dict_index_is_clust(index));
- ut_ad(!index->table->is_temporary());
-
- no_counter = use <= IBUF_USE_INSERT;
-
- switch (op) {
- case IBUF_OP_INSERT:
- switch (use) {
- case IBUF_USE_NONE:
- case IBUF_USE_DELETE:
- case IBUF_USE_DELETE_MARK:
- DBUG_RETURN(false);
- case IBUF_USE_INSERT:
- case IBUF_USE_INSERT_DELETE_MARK:
- case IBUF_USE_ALL:
- goto check_watch;
- }
- break;
- case IBUF_OP_DELETE_MARK:
- switch (use) {
- case IBUF_USE_NONE:
- case IBUF_USE_INSERT:
- DBUG_RETURN(false);
- case IBUF_USE_DELETE_MARK:
- case IBUF_USE_DELETE:
- case IBUF_USE_INSERT_DELETE_MARK:
- case IBUF_USE_ALL:
- ut_ad(!no_counter);
- goto check_watch;
- }
- break;
- case IBUF_OP_DELETE:
- switch (use) {
- case IBUF_USE_NONE:
- case IBUF_USE_INSERT:
- case IBUF_USE_INSERT_DELETE_MARK:
- DBUG_RETURN(false);
- case IBUF_USE_DELETE_MARK:
- case IBUF_USE_DELETE:
- case IBUF_USE_ALL:
- ut_ad(!no_counter);
- goto skip_watch;
- }
- break;
- case IBUF_OP_COUNT:
- break;
- }
-
- /* unknown op or use */
- ut_error;
-
-check_watch:
- /* If a thread attempts to buffer an insert on a page while a
- purge is in progress on the same page, the purge must not be
- buffered, because it could remove a record that was
- re-inserted later. For simplicity, we block the buffering of
- all operations on a page that has a purge pending.
-
- We do not check this in the IBUF_OP_DELETE case, because that
- would always trigger the buffer pool watch during purge and
- thus prevent the buffering of delete operations. We assume
- that the issuer of IBUF_OP_DELETE has called
- buf_pool_t::watch_set(). */
-
- if (buf_pool.page_hash_contains<true>(
- page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
- /* A buffer pool watch has been set or the
- page has been read into the buffer pool.
- Do not buffer the request. If a purge operation
- is being buffered, have this request executed
- directly on the page in the buffer pool after the
- buffered entries for this page have been merged. */
- DBUG_RETURN(false);
- }
-
-skip_watch:
- entry_size = rec_get_converted_size(index, entry, 0);
-
- if (entry_size
- >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
- / 2) {
+ if (err != DB_SUCCESS)
+ goto func_exit;
- DBUG_RETURN(false);
- }
+ if (page_no != flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST +
+ root->page.frame).page)
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
- err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
- entry, entry_size,
- index, page_id, zip_size, thr);
- if (err == DB_FAIL) {
- err = ibuf_insert_low(BTR_INSERT_TREE,
- op, no_counter, entry, entry_size,
- index, page_id, zip_size, thr);
- }
+ /* Remove the page from the free list and update the ibuf size data */
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t{0, page_no}, 0, RW_X_LATCH, nullptr, BUF_GET,
+ &mtr, &err))
+ err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ fil_system.sys_space->free_limit, &mtr);
- ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL
- || err == DB_TOO_BIG_RECORD);
+ if (err == DB_SUCCESS)
+ buf_page_free(fil_system.sys_space, page_no, &mtr);
- DBUG_RETURN(err == DB_SUCCESS);
+ goto func_exit;
}
MY_ATTRIBUTE((nonnull, warn_unused_result))
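
Aside (illustrative, not part of the patch): the new code above frees a change-buffer page only if it is still the last node of PAGE_BTR_IBUF_FREE_LIST, because, as the comment notes, concurrent deletes consume pages from the front of that list. A minimal, self-contained C++ sketch of the same "remove only if it is still the tail" guard, using std::list as a stand-in for the on-disk flst structure; all names below are hypothetical.

    #include <cstdint>
    #include <iostream>
    #include <list>

    // Stand-in for the file-based free list: other code paths pop from the
    // front; the shrinking path may only remove the tail, and must re-check
    // that its candidate is still the tail (otherwise: treat as corruption).
    static bool remove_if_still_last(std::list<uint32_t> &free_list,
                                     uint32_t page_no)
    {
      if (free_list.empty() || free_list.back() != page_no)
        return false;             // analogous to "goto corrupted"
      free_list.pop_back();       // analogous to flst_remove() of the last node
      return true;
    }

    int main()
    {
      std::list<uint32_t> free_list{10, 11, 12};
      std::cout << remove_if_still_last(free_list, 12) << '\n'; // 1
      std::cout << remove_if_still_last(free_list, 12) << '\n'; // 0: 12 is gone
    }
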
@@ -3547,9 +321,7 @@ ibuf_insert_to_index_page_low(
return DB_SUCCESS;
/* Page reorganization or recompression should already have been
- attempted by page_cur_tuple_insert(). Besides, per
- ibuf_index_page_calc_free_zip() the page should not have been
- recompressed or reorganized. */
+ attempted by page_cur_tuple_insert(). */
ut_ad(!is_buf_block_get_page_zip(page_cur->block));
/* If the record did not fit, reorganize */
@@ -3587,19 +359,16 @@ ibuf_insert_to_index_page(
block->page.id().page_no()));
ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index
- ut_ad(ibuf_inside(mtr));
ut_ad(dtuple_check_typed(entry));
#ifdef BTR_CUR_HASH_ADAPT
- /* A change buffer merge must occur before users are granted
- any access to the page. No adaptive hash index entries may
- point to a freshly read page. */
+ /* ibuf_cleanup() must finish before the adaptive hash index
+ can be inserted into. */
ut_ad(!block->index);
- assert_block_ahi_empty(block);
#endif /* BTR_CUR_HASH_ADAPT */
ut_ad(mtr->is_named_space(block->page.id().space()));
- if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
- != (ibool)!!page_is_comp(page))) {
+ if (UNIV_UNLIKELY(index->table->not_redundant()
+ != !!page_is_comp(page))) {
return DB_CORRUPTION;
}
@@ -3739,7 +508,6 @@ ibuf_set_del_mark(
page_cur.index = index;
ulint up_match = 0, low_match = 0;
- ut_ad(ibuf_inside(mtr));
ut_ad(dtuple_check_typed(entry));
if (!page_cur_search_with_match(entry, PAGE_CUR_LE,
@@ -3798,7 +566,6 @@ ibuf_delete(
page_cur.index = index;
ulint up_match = 0, low_match = 0;
- ut_ad(ibuf_inside(mtr));
ut_ad(dtuple_check_typed(entry));
ut_ad(!index->is_spatial());
ut_ad(!index->is_clust());
@@ -3807,7 +574,6 @@ ibuf_delete(
&up_match, &low_match, &page_cur,
nullptr)
&& low_match == dtuple_get_n_fields(entry)) {
- page_zip_des_t* page_zip= buf_block_get_page_zip(block);
page_t* page = buf_block_get_frame(block);
rec_t* rec = page_cur_get_rec(&page_cur);
@@ -3817,7 +583,6 @@ ibuf_delete(
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
mem_heap_t* heap = NULL;
- ulint max_ins_size = 0;
rec_offs_init(offsets_);
@@ -3848,12 +613,8 @@ ibuf_delete(
return;
}
- if (!page_zip) {
- max_ins_size
- = page_get_max_insert_size_after_reorganize(
- page, 1);
- }
#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
page_cur_delete_rec(&page_cur, offsets, mtr);
@@ -3861,761 +622,438 @@ ibuf_delete(
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- if (page_zip) {
- ibuf_update_free_bits_zip(block, mtr);
- } else {
- ibuf_update_free_bits_low(block, max_ins_size, mtr);
- }
-
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
}
}
-/*********************************************************************//**
-Restores insert buffer tree cursor position
-@return whether the position was restored */
-static MY_ATTRIBUTE((nonnull))
-bool
-ibuf_restore_pos(
-/*=============*/
- const page_id_t page_id,/*!< in: page identifier */
- const dtuple_t* search_tuple,
- /*!< in: search tuple for entries of page_no */
- btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */
- btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
- position is to be restored */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+/** Reset the bits in the bitmap page for the given page number.
+@param bitmap change buffer bitmap page
+@param offset page number
+@param mtr mini-transaction */
+static void ibuf_reset(buf_block_t &bitmap, uint32_t offset, mtr_t *mtr)
{
- if (UNIV_LIKELY(pcur->restore_position(mode, mtr) ==
- btr_pcur_t::SAME_ALL)) {
- return true;
- }
-
- if (fil_space_t* s = fil_space_t::get(page_id.space())) {
- ib::error() << "ibuf cursor restoration fails!"
- " ibuf record inserted to page "
- << page_id
- << " in file " << s->chain.start->name;
- s->release();
-
- ib::error() << BUG_REPORT_MSG;
-
- rec_print_old(stderr, btr_pcur_get_rec(pcur));
- rec_print_old(stderr, pcur->old_rec);
- dtuple_print(stderr, search_tuple);
- }
-
- ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
- return false;
+ offset&= uint32_t(bitmap.physical_size() - 1);
+ byte *map_byte= &bitmap.page.frame[PAGE_DATA + offset / 2];
+ /* We must reset IBUF_BITMAP_BUFFERED, but at the same time we will also
+ reset IBUF_BITMAP_FREE (and IBUF_BITMAP_IBUF, which should be clear). */
+ byte b= byte(*map_byte & ((offset & 1) ? byte{0xf} : byte{0xf0}));
+ mtr->write<1,mtr_t::MAYBE_NOP>(bitmap, map_byte, b);
}
-/**
-Delete a change buffer record.
-@param[in] page_id page identifier
-@param[in,out] pcur persistent cursor positioned on the record
-@param[in] search_tuple search key for (space,page_no)
-@param[in,out] mtr mini-transaction
-@return whether mtr was committed (due to pessimistic operation) */
-static MY_ATTRIBUTE((warn_unused_result, nonnull))
-bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
- const dtuple_t* search_tuple, mtr_t* mtr)
+/** Move to the next change buffer record. */
+ATTRIBUTE_COLD static dberr_t ibuf_move_to_next(btr_cur_t *cur, mtr_t *mtr)
{
- dberr_t err;
-
- ut_ad(ibuf_inside(mtr));
- ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
- ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur))
- == page_id.page_no());
- ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur))
- == page_id.space());
-
- switch (btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
- BTR_CREATE_FLAG, mtr)) {
- case DB_FAIL:
- break;
- case DB_SUCCESS:
- if (page_is_empty(btr_pcur_get_page(pcur))) {
- /* If a B-tree page is empty, it must be the root page
- and the whole B-tree must be empty. InnoDB does not
- allow empty B-tree pages other than the root. */
- ut_d(const page_t* root = btr_pcur_get_page(pcur));
-
- ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
- ut_ad(page_get_page_no(root)
- == FSP_IBUF_TREE_ROOT_PAGE_NO);
-
- /* ibuf.empty is protected by the root page latch.
- Before the deletion, it had to be FALSE. */
- ut_ad(!ibuf.empty);
- ibuf.empty = true;
- }
- /* fall through */
- default:
- return(FALSE);
- }
-
- /* We have to resort to a pessimistic delete from ibuf.
- Delete-mark the record so that it will not be applied again,
- in case the server crashes before the pessimistic delete is
- made persistent. */
- btr_rec_set_deleted<true>(btr_pcur_get_block(pcur),
- btr_pcur_get_rec(pcur), mtr);
-
- btr_pcur_store_position(pcur, mtr);
- ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
-
- ibuf_mtr_start(mtr);
- mysql_mutex_lock(&ibuf_mutex);
- mtr_x_lock_index(ibuf.index, mtr);
-
- if (!ibuf_restore_pos(page_id, search_tuple,
- BTR_PURGE_TREE_ALREADY_LATCHED, pcur, mtr)) {
- mysql_mutex_unlock(&ibuf_mutex);
- goto func_exit;
- }
-
- if (buf_block_t* ibuf_root = ibuf_tree_root_get(mtr)) {
- btr_cur_pessimistic_delete(&err, TRUE,
- btr_pcur_get_btr_cur(pcur),
- BTR_CREATE_FLAG, false, mtr);
- ut_a(err == DB_SUCCESS);
-
- ibuf_size_update(ibuf_root->page.frame);
- ibuf.empty = page_is_empty(ibuf_root->page.frame);
- }
-
- mysql_mutex_unlock(&ibuf_mutex);
- ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
-
-func_exit:
- ut_ad(mtr->has_committed());
- btr_pcur_close(pcur);
-
- return(TRUE);
-}
+ if (!page_cur_move_to_next(&cur->page_cur))
+ return DB_CORRUPTION;
+ if (!page_cur_is_after_last(&cur->page_cur))
+ return DB_SUCCESS;
-/** Check whether buffered changes exist for a page.
-@param[in] id page identifier
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return whether buffered changes exist */
-bool ibuf_page_exists(const page_id_t id, ulint zip_size)
-{
- ut_ad(!fsp_is_system_temporary(id.space()));
+ /* The following is adapted from btr_pcur_move_to_next_page(),
+ but we will not release any latches. */
- const ulint physical_size = zip_size ? zip_size : srv_page_size;
+ const buf_block_t &block= *cur->page_cur.block;
+ const uint32_t next_page_no= btr_page_get_next(block.page.frame);
+ switch (next_page_no) {
+ case 0:
+ case 1:
+ return DB_CORRUPTION;
+ case FIL_NULL:
+ return DB_SUCCESS;
+ }
- if (ibuf_fixed_addr_page(id, physical_size)
- || fsp_descr_page(id, physical_size)) {
- return false;
- }
+ if (UNIV_UNLIKELY(next_page_no == block.page.id().page_no()))
+ return DB_CORRUPTION;
- mtr_t mtr;
- bool bitmap_bits = false;
+ dberr_t err;
+ buf_block_t *next=
+ btr_block_get(*cur->index(), next_page_no, RW_X_LATCH, mtr, &err);
+ if (!next)
+ return err;
- ibuf_mtr_start(&mtr);
- if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
- id, zip_size, &mtr)) {
- bitmap_bits = ibuf_bitmap_page_get_bits(
- bitmap_page->page.frame, id, zip_size,
- IBUF_BITMAP_BUFFERED, &mtr) != 0;
- }
- ibuf_mtr_commit(&mtr);
- return bitmap_bits;
-}
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next->page.frame + FIL_PAGE_PREV,
+ block.page.frame + FIL_PAGE_OFFSET, 4)))
+ return DB_CORRUPTION;
-/** Reset the bits in the bitmap page for the given block and page id.
-@param b X-latched secondary index page (nullptr to discard changes)
-@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param mtr mini-transaction */
-static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id,
- ulint zip_size, mtr_t *mtr)
-{
- buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr);
- if (!bitmap)
- return;
-
- const ulint physical_size = zip_size ? zip_size : srv_page_size;
- /* FIXME: update the bitmap byte only once! */
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(bitmap, page_id,
- physical_size, false, mtr);
-
- if (b)
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(bitmap, page_id, physical_size,
- ibuf_index_page_calc_free(b),
- mtr);
+ page_cur_set_before_first(next, &cur->page_cur);
+ return page_cur_move_to_next(&cur->page_cur) ? DB_SUCCESS : DB_CORRUPTION;
}
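
Aside (illustrative, not part of the patch): ibuf_move_to_next() above steps to the right sibling leaf via FIL_PAGE_NEXT and then verifies that the sibling's FIL_PAGE_PREV points back to the current page number. A minimal sketch of that back-pointer validation over a toy in-memory page chain; the Page struct and function names are hypothetical.

    #include <cstdint>
    #include <optional>
    #include <unordered_map>

    constexpr uint32_t PAGE_NONE = 0xFFFFFFFF;   // plays the role of FIL_NULL

    struct Page { uint32_t prev, next; };        // toy stand-in for page headers

    // Return the right sibling, or std::nullopt at the end of the chain or
    // when the sibling's back pointer disagrees (the patch returns
    // DB_CORRUPTION in that case).
    std::optional<uint32_t>
    move_right(const std::unordered_map<uint32_t, Page> &pages, uint32_t page_no)
    {
      auto cur = pages.find(page_no);
      if (cur == pages.end())
        return std::nullopt;
      const uint32_t next = cur->second.next;
      if (next == PAGE_NONE || next == page_no)
        return std::nullopt;
      auto sib = pages.find(next);
      if (sib == pages.end() || sib->second.prev != page_no)
        return std::nullopt;                     // back pointer mismatch
      return next;
    }

    int main()
    {
      std::unordered_map<uint32_t, Page> pages{
        {4, {PAGE_NONE, 7}}, {7, {4, PAGE_NONE}}};
      return move_right(pages, 4).value_or(0) == 7 ? 0 : 1;
    }
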
-/** When an index page is read from a disk to the buffer pool, this function
-applies any buffered operations to the page and deletes the entries from the
-insert buffer. If the page is not read, but created in the buffer pool, this
-function deletes its buffered entries from the insert buffer; there can
-exist entries for such a page if the page belonged to an index which
-subsequently was dropped.
-@param block X-latched page to try to apply changes to, or NULL to discard
-@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return error code */
-dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
- const page_id_t page_id,
- ulint zip_size)
+/** @return if buffered changes exist for the page */
+ATTRIBUTE_COLD
+static bool ibuf_bitmap_buffered(const buf_block_t *bitmap, uint32_t offset)
{
- if (trx_sys_hdr_page(page_id)) {
- return DB_SUCCESS;
- }
-
- ut_ad(!block || page_id == block->page.id());
- ut_ad(!block || block->page.frame);
- ut_ad(!block || !block->page.is_ibuf_exist());
- ut_ad(!block || !block->page.is_reinit());
- ut_ad(!trx_sys_hdr_page(page_id));
- ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
-
- const ulint physical_size = zip_size ? zip_size : srv_page_size;
-
- if (ibuf_fixed_addr_page(page_id, physical_size)
- || fsp_descr_page(page_id, physical_size)) {
- return DB_SUCCESS;
- }
-
- btr_pcur_t pcur;
-#ifdef UNIV_IBUF_DEBUG
- ulint volume = 0;
-#endif /* UNIV_IBUF_DEBUG */
- dberr_t err = DB_SUCCESS;
- mtr_t mtr;
-
- fil_space_t* space = fil_space_t::get(page_id.space());
-
- if (UNIV_UNLIKELY(!space)) {
- block = nullptr;
- } else {
- ulint bitmap_bits = 0;
-
- ibuf_mtr_start(&mtr);
-
- buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
- page_id, zip_size, &mtr);
-
- if (bitmap_page
- && fil_page_get_type(bitmap_page->page.frame)
- != FIL_PAGE_TYPE_ALLOCATED) {
- bitmap_bits = ibuf_bitmap_page_get_bits(
- bitmap_page->page.frame, page_id, zip_size,
- IBUF_BITMAP_BUFFERED, &mtr);
- }
-
- ibuf_mtr_commit(&mtr);
-
- if (!bitmap_bits) {
- done:
- /* No changes are buffered for this page. */
- space->release();
- return DB_SUCCESS;
- }
-
- if (!block
- || DB_SUCCESS
- == fseg_page_is_allocated(space, page_id.page_no())) {
- ibuf_mtr_start(&mtr);
- mtr.set_named_space(space);
- ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
- ibuf_mtr_commit(&mtr);
- if (!block
- || btr_page_get_index_id(block->page.frame)
- != DICT_IBUF_ID_MIN + IBUF_SPACE_ID) {
- ibuf_delete_recs(page_id);
- }
- goto done;
- }
- }
-
- if (!block) {
- } else if (!fil_page_index_page_check(block->page.frame)
- || !page_is_leaf(block->page.frame)) {
- space->set_corrupted();
- err = DB_CORRUPTION;
- block = nullptr;
- } else {
- /* Move the ownership of the x-latch on the page to this OS
- thread, so that we can acquire a second x-latch on it. This
- is needed for the insert operations to the index page to pass
- the debug checks. */
-
- block->page.lock.claim_ownership();
- }
-
- mem_heap_t* heap = mem_heap_create(512);
-
- const dtuple_t* search_tuple = ibuf_search_tuple_build(
- page_id.space(), page_id.page_no(), heap);
-
- /* Counts for merged & discarded operations. */
- ulint mops[IBUF_OP_COUNT];
- ulint dops[IBUF_OP_COUNT];
-
- memset(mops, 0, sizeof(mops));
- memset(dops, 0, sizeof(dops));
- pcur.btr_cur.page_cur.index = ibuf.index;
-
-loop:
- ibuf_mtr_start(&mtr);
-
- /* Position pcur in the insert buffer at the first entry for this
- index page */
- if (btr_pcur_open_on_user_rec(search_tuple,
- BTR_MODIFY_LEAF, &pcur, &mtr)
- != DB_SUCCESS) {
- err = DB_CORRUPTION;
- goto reset_bit;
- }
-
- if (block) {
- block->page.fix();
- block->page.lock.x_lock_recursive();
- mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
- }
-
- if (space) {
- mtr.set_named_space(space);
- }
-
- if (!btr_pcur_is_on_user_rec(&pcur)) {
- ut_ad(btr_pcur_is_after_last_on_page(&pcur));
- goto reset_bit;
- }
-
- for (;;) {
- rec_t* rec;
+ if (!bitmap)
+ return false;
+ offset&= uint32_t(bitmap->physical_size() - 1);
+ byte *map_byte= &bitmap->page.frame[PAGE_DATA + offset / 2];
+ return *map_byte & (byte{4} << ((offset & 1) << 2));
+}
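
Aside (illustrative, not part of the patch): the two bitmap accessors above rely on the change-buffer bitmap packing 4 status bits per tracked page, two pages per byte, with even page offsets in the low nibble and odd offsets in the high nibble; IBUF_BITMAP_BUFFERED is bit 2 (mask 4) within the nibble, so the shift is either 0 or 4. A self-contained sketch of the same packing, with hypothetical names:

    #include <cassert>
    #include <cstdint>

    // 4 status bits per page, two pages packed per byte; the "buffered" flag
    // is bit 2 within the page's nibble.
    inline bool nibble_buffered(const uint8_t *map, uint32_t offset)
    {
      return map[offset / 2] & uint8_t(4U << ((offset & 1) << 2)); // shift 0 or 4
    }

    inline void nibble_reset(uint8_t *map, uint32_t offset)
    {
      // Clear the whole nibble (free, buffered and ibuf bits) for this page.
      map[offset / 2] &= (offset & 1) ? 0x0f : 0xf0;
    }

    int main()
    {
      uint8_t map[2] = {0x40, 0};   // buffered bit set for page offset 1
      assert(!nibble_buffered(map, 0));
      assert(nibble_buffered(map, 1));
      nibble_reset(map, 1);
      assert(!nibble_buffered(map, 1));
    }
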
+
+/** Apply changes to a block. */
+ATTRIBUTE_COLD
+static dberr_t ibuf_merge(fil_space_t *space, btr_cur_t *cur, mtr_t *mtr)
+{
+ if (btr_cur_get_rec(cur)[4])
+ return DB_CORRUPTION;
+
+ const uint32_t space_id= mach_read_from_4(btr_cur_get_rec(cur));
+ const uint32_t page_no= mach_read_from_4(btr_cur_get_rec(cur) + 5);
+
+ buf_block_t *block= space && page_no < space->size
+ ? buf_page_get_gen(page_id_t{space_id, page_no}, space->zip_size(),
+ RW_X_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, mtr)
+ : nullptr;
+
+ buf_block_t *bitmap= block
+ ? buf_page_get_gen(page_id_t(space_id,
+ uint32_t(page_no &
+ ~(block->physical_size() - 1)) + 1),
+ block->zip_size(), RW_X_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr)
+ : nullptr;
+ bool buffered= false;
+
+ if (!block);
+ else if (fil_page_get_type(block->page.frame) != FIL_PAGE_INDEX ||
+ !page_is_leaf(block->page.frame) ||
+ DB_SUCCESS == fseg_page_is_allocated(space, page_no))
+ block= nullptr;
+ else
+ buffered= ibuf_bitmap_buffered(bitmap, block->page.id().page_no());
+
+ do
+ {
+ rec_t *rec= cur->page_cur.rec;
+ ulint n_fields= rec_get_n_fields_old(rec);
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ if (n_fields < IBUF_REC_FIELD_USER + 1 || rec[4])
+ return DB_CORRUPTION;
- rec = btr_pcur_get_rec(&pcur);
+ n_fields-= IBUF_REC_FIELD_USER;
- /* Check if the entry is for this index page */
- if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no()
- || ibuf_rec_get_space(&mtr, rec) != page_id.space()) {
+ ulint types_len, not_redundant;
- if (block != NULL) {
- page_header_reset_last_insert(block, &mtr);
- }
+ if (rec_get_1byte_offs_flag(rec))
+ {
+ if (rec_1_get_field_end_info(rec, 0) != 4 ||
+ rec_1_get_field_end_info(rec, 1) != 5 ||
+ rec_1_get_field_end_info(rec, 2) != 9)
+ return DB_CORRUPTION;
+ types_len= rec_1_get_field_end_info(rec, 3);
+ }
+ else
+ {
+ if (rec_2_get_field_end_info(rec, 0) != 4 ||
+ rec_2_get_field_end_info(rec, 1) != 5 ||
+ rec_2_get_field_end_info(rec, 2) != 9)
+ return DB_CORRUPTION;
+ types_len= rec_2_get_field_end_info(rec, 3);
+ }
- goto reset_bit;
- }
+ if (types_len < 9 || (types_len - 9) / 6 != n_fields)
+ return DB_CORRUPTION;
- if (err) {
- fputs("InnoDB: Discarding record\n ", stderr);
- rec_print_old(stderr, rec);
- fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
- } else if (block != NULL && !rec_get_deleted_flag(rec, 0)) {
- /* Now we have at pcur a record which should be
- applied on the index page; NOTE that the call below
- copies pointers to fields in rec, and we must
- keep the latch to the rec page until the
- insertion is finished! */
- dtuple_t* entry;
- trx_id_t max_trx_id;
- dict_index_t* dummy_index;
- ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec);
-
- max_trx_id = page_get_max_trx_id(page_align(rec));
- page_update_max_trx_id(block,
- buf_block_get_page_zip(block),
- max_trx_id, &mtr);
-
- ut_ad(page_validate(page_align(rec), ibuf.index));
-
- entry = ibuf_build_entry_from_ibuf_rec(
- &mtr, rec, heap, &dummy_index);
- ut_ad(!dummy_index->table->space);
- dummy_index->table->space = space;
- dummy_index->table->space_id = space->id;
-
- ut_ad(page_validate(block->page.frame, dummy_index));
-
- switch (op) {
- case IBUF_OP_INSERT:
-#ifdef UNIV_IBUF_DEBUG
- volume += rec_get_converted_size(
- dummy_index, entry, 0);
-
- volume += page_dir_calc_reserved_space(1);
-
- ut_a(volume <= (4U << srv_page_size_shift)
- / IBUF_PAGE_SIZE_PER_FREE_SPACE);
-#endif
- ibuf_insert_to_index_page(
- entry, block, dummy_index, &mtr);
- break;
-
- case IBUF_OP_DELETE_MARK:
- ibuf_set_del_mark(
- entry, block, dummy_index, &mtr);
- break;
-
- case IBUF_OP_DELETE:
- ibuf_delete(entry, block, dummy_index, &mtr);
- /* Because ibuf_delete() will latch an
- insert buffer bitmap page, commit mtr
- before latching any further pages.
- Store and restore the cursor position. */
- ut_ad(rec == btr_pcur_get_rec(&pcur));
- ut_ad(page_rec_is_user_rec(rec));
- ut_ad(ibuf_rec_get_page_no(&mtr, rec)
- == page_id.page_no());
- ut_ad(ibuf_rec_get_space(&mtr, rec)
- == page_id.space());
-
- /* Mark the change buffer record processed,
- so that it will not be merged again in case
- the server crashes between the following
- mtr_commit() and the subsequent mtr_commit()
- of deleting the change buffer record. */
- btr_rec_set_deleted<true>(
- btr_pcur_get_block(&pcur),
- btr_pcur_get_rec(&pcur), &mtr);
-
- btr_pcur_store_position(&pcur, &mtr);
- ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
-
- ibuf_mtr_start(&mtr);
- mtr.set_named_space(space);
-
- block->page.lock.x_lock_recursive();
- block->fix();
- mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
-
- if (!ibuf_restore_pos(page_id, search_tuple,
- BTR_MODIFY_LEAF,
- &pcur, &mtr)) {
-
- ut_ad(mtr.has_committed());
- mops[op]++;
- ibuf_dummy_index_free(dummy_index);
- goto loop;
- }
-
- break;
- default:
- ut_error;
- }
-
- mops[op]++;
-
- ibuf_dummy_index_free(dummy_index);
- } else {
- dops[ibuf_rec_get_op_type(&mtr, rec)]++;
- }
-
- /* Delete the record from ibuf */
- if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) {
- /* Deletion was pessimistic and mtr was committed:
- we start from the beginning again */
+ ibuf_op op= IBUF_OP_INSERT;
+ const ulint info_len= (types_len - 9) % 6;
- ut_ad(mtr.has_committed());
- goto loop;
- } else if (btr_pcur_is_after_last_on_page(&pcur)) {
- ibuf_mtr_commit(&mtr);
- goto loop;
- }
- }
+ switch (info_len) {
+ default:
+ return DB_CORRUPTION;
+ case 0: case 1:
+ not_redundant= info_len;
+ break;
+ case 4:
+ not_redundant= rec[9 + 3];
+ if (rec[9 + 2] > IBUF_OP_DELETE || not_redundant > 1)
+ return DB_CORRUPTION;
+ op= static_cast<ibuf_op>(rec[9 + 2]);
+ }
-reset_bit:
- if (space) {
- ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
- }
+ const byte *const types= rec + 9 + info_len;
- ibuf_mtr_commit(&mtr);
- ut_free(pcur.old_rec_buf);
+ if (ibuf_rec_get_space(rec) != space_id ||
+ ibuf_rec_get_page_no(rec) != page_no)
+ break;
- if (space) {
- space->release();
- }
+ if (!rec_get_deleted_flag(rec, 0))
+ {
+ /* Delete-mark the record so that it will not be applied again if
+ the server is killed before the completion of ibuf_upgrade(). */
+ btr_rec_set_deleted<true>(cur->page_cur.block, rec, mtr);
+
+ if (buffered)
+ {
+ page_header_reset_last_insert(block, mtr);
+ page_update_max_trx_id(block, buf_block_get_page_zip(block),
+ page_get_max_trx_id(page_align(rec)), mtr);
+ dict_index_t *index;
+ mem_heap_t *heap = mem_heap_create(512);
+ dtuple_t *entry= ibuf_entry_build(rec, not_redundant, n_fields,
+ types, heap, index);
+ dict_table_t *table= index->table;
+ ut_ad(!table->space);
+ table->space= space;
+ table->space_id= space_id;
+
+ switch (op) {
+ case IBUF_OP_INSERT:
+ ibuf_insert_to_index_page(entry, block, index, mtr);
+ break;
+ case IBUF_OP_DELETE_MARK:
+ ibuf_set_del_mark(entry, block, index, mtr);
+ break;
+ case IBUF_OP_DELETE:
+ ibuf_delete(entry, block, index, mtr);
+ break;
+ }
+
+ mem_heap_free(heap);
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+ }
+ }
- mem_heap_free(heap);
+ if (dberr_t err= ibuf_move_to_next(cur, mtr))
+ return err;
+ }
+ while (!page_cur_is_after_last(&cur->page_cur));
- ibuf.n_merges++;
- ibuf_add_ops(ibuf.n_merged_ops, mops);
- ibuf_add_ops(ibuf.n_discarded_ops, dops);
+ if (bitmap)
+ ibuf_reset(*bitmap, page_no, mtr);
- return err;
+ return DB_SUCCESS;
}
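
Aside (illustrative, not part of the patch): inside ibuf_merge() above, the metadata field consists of 6 bytes of type information per user field plus an optional header of 0, 1 or 4 bytes, so the remainder of (types_len - 9) by 6 yields the header length while the quotient must match the user-field count. A minimal sketch of that decoding; the names are hypothetical.

    #include <cassert>
    #include <cstdint>
    #include <optional>

    struct IbufMeta { uint32_t n_user_fields; uint32_t info_len; };

    // metadata_len = 6 * n_user_fields + info_len, with info_len in {0, 1, 4}.
    // Remainders 2, 3 and 5 cannot occur and indicate a corrupted record.
    std::optional<IbufMeta> decode_metadata_len(uint32_t metadata_len)
    {
      IbufMeta m{metadata_len / 6, metadata_len % 6};
      switch (m.info_len) {
      case 0: case 1: case 4:
        return m;
      default:
        return std::nullopt;      // the patch returns DB_CORRUPTION here
      }
    }

    int main()
    {
      assert(decode_metadata_len(6 * 3 + 4)->n_user_fields == 3);
      assert(!decode_metadata_len(6 * 3 + 2));
    }
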
-/** Delete all change buffer entries for a tablespace,
-in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
-@param[in] space missing or to-be-discarded tablespace */
-void ibuf_delete_for_discarded_space(uint32_t space)
+static dberr_t ibuf_open(btr_cur_t *cur, mtr_t *mtr)
{
- if (UNIV_UNLIKELY(!ibuf.index)) return;
-
- btr_pcur_t pcur;
- const rec_t* ibuf_rec;
- mtr_t mtr;
-
- /* Counts for discarded operations. */
- ulint dops[IBUF_OP_COUNT];
-
- dfield_t dfield[IBUF_REC_FIELD_METADATA];
- dtuple_t search_tuple {0,IBUF_REC_FIELD_METADATA,
- IBUF_REC_FIELD_METADATA,dfield,0
- ,nullptr
-#ifdef UNIV_DEBUG
- ,DATA_TUPLE_MAGIC_N
-#endif /* UNIV_DEBUG */
- };
- byte space_id[4];
- mach_write_to_4(space_id, space);
-
- dfield_set_data(&dfield[0], space_id, 4);
- dfield_set_data(&dfield[1], field_ref_zero, 1);
- dfield_set_data(&dfield[2], field_ref_zero, 4);
- dtuple_set_types_binary(&search_tuple, IBUF_REC_FIELD_METADATA);
- /* Use page number 0 to build the search tuple so that we get the
- cursor positioned at the first entry for this space id */
-
- memset(dops, 0, sizeof(dops));
- pcur.btr_cur.page_cur.index = ibuf.index;
-
-loop:
- log_free_check();
- ibuf_mtr_start(&mtr);
-
- /* Position pcur in the insert buffer at the first entry for the
- space */
- if (btr_pcur_open_on_user_rec(&search_tuple,
- BTR_MODIFY_LEAF, &pcur, &mtr)
- != DB_SUCCESS) {
- goto leave_loop;
- }
-
- if (!btr_pcur_is_on_user_rec(&pcur)) {
- ut_ad(btr_pcur_is_after_last_on_page(&pcur));
- goto leave_loop;
- }
-
- for (;;) {
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
-
- ibuf_rec = btr_pcur_get_rec(&pcur);
-
- /* Check if the entry is for this space */
- if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
-
- goto leave_loop;
- }
-
- uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+ ut_ad(mtr->get_savepoint() == 1);
- dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+ uint32_t page= FSP_IBUF_TREE_ROOT_PAGE_NO;
- /* Delete the record from ibuf */
- if (ibuf_delete_rec(page_id_t(space, page_no),
- &pcur, &search_tuple, &mtr)) {
- /* Deletion was pessimistic and mtr was committed:
- we start from the beginning again */
+ for (ulint height= ULINT_UNDEFINED;;)
+ {
+ dberr_t err;
+ buf_block_t* block= btr_block_get(*cur->index(), page, RW_X_LATCH, mtr,
+ &err);
+ ut_ad(!block == (err != DB_SUCCESS));
- ut_ad(mtr.has_committed());
-clear:
- ut_free(pcur.old_rec_buf);
- goto loop;
- }
+ if (!block)
+ return err;
- if (btr_pcur_is_after_last_on_page(&pcur)) {
- ibuf_mtr_commit(&mtr);
- goto clear;
- }
- }
+ page_cur_set_before_first(block, &cur->page_cur);
+ const uint32_t l= btr_page_get_level(block->page.frame);
-leave_loop:
- ibuf_mtr_commit(&mtr);
- ut_free(pcur.old_rec_buf);
+ if (height == ULINT_UNDEFINED)
+ height= l;
+ else
+ {
+ /* Release the parent page latch. */
+ ut_ad(mtr->get_savepoint() == 3);
+ mtr->rollback_to_savepoint(1, 2);
- ibuf_add_ops(ibuf.n_discarded_ops, dops);
-}
+ if (UNIV_UNLIKELY(height != l))
+ return DB_CORRUPTION;
+ }
-/******************************************************************//**
-Looks if the insert buffer is empty.
-@return true if empty */
-bool
-ibuf_is_empty(void)
-/*===============*/
-{
- mtr_t mtr;
+ if (!height)
+ return ibuf_move_to_next(cur, mtr);
- ibuf_mtr_start(&mtr);
+ height--;
- ut_d(mysql_mutex_lock(&ibuf_mutex));
- const buf_block_t* root = ibuf_tree_root_get(&mtr);
- bool is_empty = root && page_is_empty(root->page.frame);
- ut_ad(!root || is_empty == ibuf.empty);
- ut_d(mysql_mutex_unlock(&ibuf_mutex));
- ibuf_mtr_commit(&mtr);
+ if (!page_cur_move_to_next(&cur->page_cur))
+ return DB_CORRUPTION;
- return(is_empty);
+ const rec_t *ptr= cur->page_cur.rec;
+ const ulint n_fields= rec_get_n_fields_old(ptr);
+ if (n_fields <= IBUF_REC_FIELD_USER)
+ return DB_CORRUPTION;
+ ulint len;
+ ptr+= rec_get_nth_field_offs_old(ptr, n_fields - 1, &len);
+ if (len != 4)
+ return DB_CORRUPTION;
+ page= mach_read_from_4(ptr);
+ }
}
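
Aside (illustrative, not part of the patch): ibuf_open() above descends from the tree root to the leftmost leaf, following the first node-pointer record on each level and checking that the page level decreases by exactly one per step. A self-contained sketch of that descent over a toy in-memory tree; the Node type and names are hypothetical.

    #include <cstdint>
    #include <optional>
    #include <unordered_map>
    #include <vector>

    struct Node {
      uint32_t level;                  // 0 = leaf
      std::vector<uint32_t> children;  // child page numbers (empty on leaves)
    };

    // Walk to the leftmost leaf, validating the level bookkeeping on the way,
    // much like the "height" checks in ibuf_open().
    std::optional<uint32_t>
    leftmost_leaf(const std::unordered_map<uint32_t, Node> &tree, uint32_t root)
    {
      std::optional<uint32_t> expected_level;
      for (uint32_t page = root;;) {
        auto it = tree.find(page);
        if (it == tree.end())
          return std::nullopt;                   // missing page: corruption
        const Node &n = it->second;
        if (expected_level && n.level != *expected_level)
          return std::nullopt;                   // level mismatch: corruption
        if (n.level == 0)
          return page;                           // reached the leftmost leaf
        if (n.children.empty())
          return std::nullopt;
        expected_level = n.level - 1;
        page = n.children.front();               // follow the leftmost pointer
      }
    }

    int main()
    {
      std::unordered_map<uint32_t, Node> tree{
        {3, {1, {5, 6}}}, {5, {0, {}}}, {6, {0, {}}}};
      return leftmost_leaf(tree, 3).value_or(0) == 5 ? 0 : 1;
    }
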
-/******************************************************************//**
-Prints info of ibuf. */
-void
-ibuf_print(
-/*=======*/
- FILE* file) /*!< in: file where to print */
+ATTRIBUTE_COLD dberr_t ibuf_upgrade()
{
- if (UNIV_UNLIKELY(!ibuf.index)) return;
-
- mysql_mutex_lock(&ibuf_mutex);
- if (ibuf.empty)
+ if (srv_read_only_mode)
{
- mysql_mutex_unlock(&ibuf_mutex);
- return;
+ sql_print_error("InnoDB: innodb_read_only=ON prevents an upgrade");
+ return DB_READ_ONLY;
}
- const ulint size= ibuf.size;
- const ulint free_list_len= ibuf.free_list_len;
- const ulint seg_size= ibuf.seg_size;
- mysql_mutex_unlock(&ibuf_mutex);
-
- fprintf(file,
- "-------------\n"
- "INSERT BUFFER\n"
- "-------------\n"
- "size " ULINTPF ", free list len " ULINTPF ","
- " seg size " ULINTPF ", " ULINTPF " merges\n",
- size, free_list_len, seg_size, ulint{ibuf.n_merges});
- ibuf_print_ops("merged operations:\n", ibuf.n_merged_ops, file);
- ibuf_print_ops("discarded operations:\n", ibuf.n_discarded_ops, file);
-}
-
-/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
-@param[in] trx transaction
-@param[in,out] space tablespace being imported
-@return DB_SUCCESS or error code */
-dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
-{
- ut_ad(trx->mysql_thd);
- ut_ad(space->purpose == FIL_TYPE_IMPORT);
-
- const unsigned zip_size = space->zip_size();
- const unsigned physical_size = space->physical_size();
+ sql_print_information("InnoDB: Upgrading the change buffer");
- uint32_t size= std::min(space->free_limit, space->size);
-
- if (size == 0) {
- return(DB_TABLE_NOT_FOUND);
- }
+#ifdef BTR_CUR_HASH_ADAPT
+ const bool ahi= btr_search_enabled;
+ if (ahi)
+ btr_search_disable();
+#endif
- mtr_t mtr;
+ dict_table_t *ibuf_table= dict_table_t::create({C_STRING_WITH_LEN("ibuf")},
+ fil_system.sys_space,
+ 1, 0, 0, 0);
+ dict_index_t *ibuf_index=
+ dict_mem_index_create(ibuf_table, "CLUST_IND", DICT_CLUSTERED, 1);
+ ibuf_index->id= ibuf_index_id;
+ ibuf_index->n_uniq= REC_MAX_N_FIELDS;
+ ibuf_index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
+ ibuf_index->page= FSP_IBUF_TREE_ROOT_PAGE_NO;
+ ut_d(ibuf_index->is_dummy= true);
+ ut_d(ibuf_index->cached= true);
+
+ size_t spaces=0, pages= 0;
+ dberr_t err;
+ mtr_t mtr;
+ mtr.start();
+ mtr_x_lock_index(ibuf_index, &mtr);
- /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
- every page_size pages. For example if page_size is 16 KiB, then the
- two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop
- below page_no is measured in number of pages since the beginning of
- the space, as usual. */
+ {
+ btr_cur_t cur;
+ uint32_t prev_space_id= ~0U;
+ fil_space_t *space= nullptr;
+ cur.page_cur.index= ibuf_index;
+ log_free_check();
+ err= ibuf_open(&cur, &mtr);
+
+ while (err == DB_SUCCESS && !page_cur_is_after_last(&cur.page_cur))
+ {
+ const uint32_t space_id= ibuf_rec_get_space(cur.page_cur.rec);
+ if (space_id != prev_space_id)
+ {
+ if (space)
+ space->release();
+ prev_space_id= space_id;
+ space= fil_space_t::get(space_id);
+ if (space)
+ {
+ /* Move to the next user tablespace. We buffer-fix the current
+ change buffer leaf page to prevent it from being evicted
+ before we have started a new mini-transaction. */
+ cur.page_cur.block->fix();
+ mtr.commit();
+ log_free_check();
+ mtr.start();
+ mtr.page_lock(cur.page_cur.block, RW_X_LATCH);
+ mtr.set_named_space(space);
+ }
+ spaces++;
+ }
+ pages++;
+ err= ibuf_merge(space, &cur, &mtr);
+ if (err == DB_SUCCESS)
+ {
+ /* Move to the next user index page. We buffer-fix the current
+ change buffer leaf page to prevent it from being evicted
+ before we have started a new mini-transaction. */
+ cur.page_cur.block->fix();
+ mtr.commit();
+
+ if (recv_sys.report(time(nullptr)))
+ {
+ sql_print_information("InnoDB: merged changes to"
+ " %zu tablespaces, %zu pages", spaces, pages);
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "merged changes to"
+ " %zu tablespaces, %zu pages",
+ spaces, pages);
+ }
+
+ log_free_check();
+ mtr.start();
+ mtr.page_lock(cur.page_cur.block, RW_X_LATCH);
+ if (space)
+ mtr.set_named_space(space);
+ }
+ }
+ mtr.commit();
+ if (space)
+ space->release();
+ }
- for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
- if (trx_is_interrupted(trx)) {
- return(DB_INTERRUPTED);
- }
+ if (err == DB_SUCCESS)
+ {
+ mtr.start();
+ if (buf_block_t *root= buf_page_get_gen(ibuf_root, 0, RW_X_LATCH,
+ nullptr, BUF_GET, &mtr, &err))
+ {
+ page_create(root, &mtr, false);
+ mtr.write<2,mtr_t::MAYBE_NOP>(*root, PAGE_HEADER + PAGE_LEVEL +
+ root->page.frame, 0U);
+ }
+ mtr.commit();
- mtr_start(&mtr);
+ while (err == DB_SUCCESS)
+ err= ibuf_remove_free_page(mtr);
- buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
- page_id_t(space->id, page_no), zip_size, &mtr);
- if (!bitmap_page) {
- mtr.commit();
- return DB_CORRUPTION;
- }
+ if (err == DB_SUCCESS_LOCKED_REC)
+ err= DB_SUCCESS;
+ }
- if (buf_is_zeroes(span<const byte>(bitmap_page->page.frame,
- physical_size))) {
- /* This means we got all-zero page instead of
- ibuf bitmap page. The subsequent page should be
- all-zero pages. */
-#ifdef UNIV_DEBUG
- for (uint32_t curr_page = page_no + 1;
- curr_page < physical_size; curr_page++) {
-
- buf_block_t* block = buf_page_get(
- page_id_t(space->id, curr_page),
- zip_size, RW_S_LATCH, &mtr);
- page_t* page = buf_block_get_frame(block);
- ut_ad(buf_is_zeroes(span<const byte>(
- page,
- physical_size)));
- }
-#endif /* UNIV_DEBUG */
- mtr_commit(&mtr);
- continue;
- }
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi)
+ btr_search_enable();
+#endif
- for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
- i++) {
- const uint32_t offset = page_no + i;
- const page_id_t cur_page_id(space->id, offset);
-
- if (ibuf_bitmap_page_get_bits(
- bitmap_page->page.frame,
- cur_page_id, zip_size,
- IBUF_BITMAP_IBUF, &mtr)) {
-
- mtr_commit(&mtr);
-
- ib_errf(trx->mysql_thd,
- IB_LOG_LEVEL_ERROR,
- ER_INNODB_INDEX_CORRUPT,
- "File %s page %u"
- " is wrongly flagged to belong to the"
- " insert buffer",
- space->chain.start->name, offset);
- return(DB_CORRUPTION);
- }
-
- if (ibuf_bitmap_page_get_bits(
- bitmap_page->page.frame,
- cur_page_id, zip_size,
- IBUF_BITMAP_BUFFERED, &mtr)) {
-
- ib_errf(trx->mysql_thd,
- IB_LOG_LEVEL_WARN,
- ER_INNODB_INDEX_CORRUPT,
- "Buffered changes"
- " for file %s page %u are lost",
- space->chain.start->name, offset);
-
- /* Tolerate this error, so that
- slightly corrupted tables can be
- imported and dumped. Clear the bit. */
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
- bitmap_page, cur_page_id,
- physical_size, false, &mtr);
- }
- }
+ ibuf_index->lock.free();
+ dict_mem_index_free(ibuf_index);
+ dict_mem_table_free(ibuf_table);
- mtr_commit(&mtr);
- }
+ if (err)
+ sql_print_error("InnoDB: Unable to upgrade the change buffer");
+ else
+ sql_print_information("InnoDB: Upgraded the change buffer: "
+ "%zu tablespaces, %zu pages", spaces, pages);
- return(DB_SUCCESS);
+ return err;
}
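
Aside (illustrative, not part of the patch): the upgrade loop above commits the mini-transaction after every merged page and reports progress only when recv_sys.report() indicates that enough wall-clock time has passed, so a large change buffer does not flood the error log. A minimal sketch of such a time-throttled progress reporter; the class and method names are hypothetical, not InnoDB API.

    #include <chrono>
    #include <cstdio>

    class ProgressReporter {
      std::chrono::steady_clock::time_point last_{};
      std::chrono::seconds interval_;
    public:
      explicit ProgressReporter(std::chrono::seconds interval)
        : interval_(interval) {}

      // Cheap to call in a tight loop; prints at most once per interval.
      void maybe_report(size_t spaces, size_t pages)
      {
        const auto now = std::chrono::steady_clock::now();
        if (now - last_ < interval_)
          return;
        last_ = now;
        std::printf("merged changes to %zu tablespaces, %zu pages\n",
                    spaces, pages);
      }
    };

    int main()
    {
      ProgressReporter progress{std::chrono::seconds{15}};
      for (size_t pages = 1; pages <= 100000; pages++)
        progress.maybe_report(1, pages);
    }
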
-void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset)
+dberr_t ibuf_upgrade_needed()
{
- ut_a(page_is_leaf(block->page.frame));
- const page_id_t id{block->page.id()};
- const auto zip_size= block->zip_size();
+ mtr_t mtr;
+ mtr.start();
+ mtr.x_lock_space(fil_system.sys_space);
+ dberr_t err;
+ const buf_block_t *header_page= recv_sys.recover(ibuf_header, &mtr, &err);
- if (buf_block_t *bitmap_page= ibuf_bitmap_get_map_page(id, zip_size, mtr))
+ if (!header_page)
{
- if (ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id, zip_size,
- IBUF_BITMAP_BUFFERED, mtr))
- ibuf_delete_recs(id);
-
- ulint free_val= reset ? 0 : ibuf_index_page_calc_free(block);
- /* FIXME: update the bitmap byte only once! */
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
- (bitmap_page, id, block->physical_size(), free_val, mtr);
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>
- (bitmap_page, id, block->physical_size(), false, mtr);
+ err_exit:
+ sql_print_error("InnoDB: The change buffer is corrupted");
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO)
+ err= DB_SUCCESS;
+ func_exit:
+ mtr.commit();
+ return err;
+ }
+
+ const buf_block_t *root= recv_sys.recover(ibuf_root, &mtr, &err);
+ if (!root)
+ goto err_exit;
+
+ if (UNIV_LIKELY(!page_has_siblings(root->page.frame)) &&
+ UNIV_LIKELY(!memcmp(root->page.frame + FIL_PAGE_TYPE, field_ref_zero,
+ srv_page_size -
+ (FIL_PAGE_DATA_END + FIL_PAGE_TYPE))))
+ /* the change buffer was removed; no need to upgrade */;
+ else if (page_is_comp(root->page.frame) ||
+ btr_page_get_index_id(root->page.frame) != ibuf_index_id ||
+ fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX)
+ {
+ err= DB_CORRUPTION;
+ goto err_exit;
}
+ else if (srv_read_only_mode)
+ {
+ sql_print_error("InnoDB: innodb_read_only=ON prevents an upgrade"
+ " of the change buffer");
+ err= DB_READ_ONLY;
+ }
+ else if (srv_force_recovery != SRV_FORCE_NO_LOG_REDO)
+ err= DB_FAIL;
+
+ goto func_exit;
}
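
Aside (illustrative, not part of the patch): ibuf_upgrade_needed() above treats the former change-buffer root as already upgraded when it has no siblings and every byte from FIL_PAGE_TYPE up to the page trailer is zero. A minimal sketch of that kind of all-zero range check; the helper name and layout parameters are hypothetical. The patch implements the equivalent with memcmp() against field_ref_zero, relying on that buffer being at least page-sized.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // True if buf[from, size - trailer) contains only zero bytes, i.e. the
    // interesting part of the page carries no payload.
    bool range_is_zero(const uint8_t *buf, size_t size, size_t from,
                       size_t trailer)
    {
      return std::all_of(buf + from, buf + size - trailer,
                         [](uint8_t b) { return b == 0; });
    }

    int main()
    {
      uint8_t page[64] = {1, 2};       // nonzero bytes only in the "header"
      return range_is_zero(page, sizeof page, 2, 8) ? 0 : 1;
    }
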
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 83bdaa97..35a567d7 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -56,12 +56,8 @@ is acceptable for the program to die with a clear assert failure. */
#define BTR_MAX_LEVELS 100
#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
- btr_latch_mode((latch_mode) & ~(BTR_INSERT \
- | BTR_DELETE_MARK \
- | BTR_RTREE_UNDO_INS \
+ btr_latch_mode((latch_mode) & ~(BTR_RTREE_UNDO_INS \
| BTR_RTREE_DELETE_MARK \
- | BTR_DELETE \
- | BTR_IGNORE_SEC_UNIQUE \
| BTR_ALREADY_S_LATCHED \
| BTR_LATCH_FOR_INSERT \
| BTR_LATCH_FOR_DELETE))
@@ -79,6 +75,14 @@ btr_root_adjust_on_import(
const dict_index_t* index) /*!< in: index tree */
MY_ATTRIBUTE((warn_unused_result));
+/** Check a file segment header within a B-tree root page.
+@param offset file segment header offset
+@param block B-tree root page
+@param space tablespace
+@return whether the segment header is valid */
+bool btr_root_fseg_validate(ulint offset, const buf_block_t &block,
+ const fil_space_t &space);
+
/** Report a decryption failure. */
ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
@@ -86,13 +90,12 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
-@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
@param[out] first set if this is a first-time access to the page
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
- uint32_t page, rw_lock_type_t mode, bool merge,
+ uint32_t page, rw_lock_type_t mode,
mtr_t *mtr, dberr_t *err= nullptr,
bool *first= nullptr);
@@ -246,15 +249,7 @@ btr_root_raise_and_insert(
mtr_t* mtr, /*!< in: mtr */
dberr_t* err) /*!< out: error code */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
-
+/** Reorganize an index page.
@param cursor page cursor
@param mtr mini-transaction
@return error code
@@ -352,6 +347,7 @@ btr_check_node_ptr(
/*===============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: index page */
+ que_thr_t* thr, /*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((warn_unused_result));
#endif /* UNIV_DEBUG */
@@ -455,15 +451,8 @@ btr_root_block_get(
or RW_X_LATCH */
mtr_t* mtr, /*!< in: mtr */
dberr_t* err); /*!< out: error code */
-/*************************************************************//**
-Reorganizes an index page.
-
-IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index. This has to
-be done either within the same mini-transaction, or by invoking
-ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
-IBUF_BITMAP_FREE is unaffected by reorganization.
+/** Reorganize an index page.
@return error code
@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
dberr_t btr_page_reorganize_block(
@@ -534,9 +523,10 @@ btr_lift_page_up(
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
+ que_thr_t* thr, /*!< in/out: query thread for SPATIAL INDEX */
mtr_t* mtr, /*!< in/out: mini-transaction */
dberr_t* err) /*!< out: error code */
- __attribute__((nonnull));
+ __attribute__((nonnull(1,2,4,5)));
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index f6abc9f5..dc64054e 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -56,11 +56,7 @@ enum {
BTR_KEEP_POS_FLAG = 8,
/** the caller is creating the index or wants to bypass the
index->info.online creation log */
- BTR_CREATE_FLAG = 16,
- /** the caller of btr_cur_optimistic_update() or
- btr_cur_update_in_place() will take care of
- updating IBUF_BITMAP_FREE */
- BTR_KEEP_IBUF_BITMAP = 32
+ BTR_CREATE_FLAG = 16
};
#include "que0types.h"
@@ -213,14 +209,8 @@ btr_cur_pessimistic_insert(
See if there is enough place in the page modification log to log
an update-in-place.
-@retval false if out of space; IBUF_BITMAP_FREE will be reset
-outside mtr if the page was recompressed
-@retval true if enough place;
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
-a secondary index leaf page. This has to be done either within the
-same mini-transaction, or by invoking ibuf_reset_free_bits() before
-mtr_commit(mtr). */
+@retval false if out of space
+@retval true if there is enough space */
bool
btr_cur_update_alloc_zip_func(
/*==========================*/
@@ -262,7 +252,7 @@ Updates a record when the update causes no size changes in its fields.
@return locking or undo log related error code, or
@retval DB_SUCCESS on success
@retval DB_ZIP_OVERFLOW if there is not enough space left
-on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+on a ROW_FORMAT=COMPRESSED page */
dberr_t
btr_cur_update_in_place(
/*====================*/
@@ -669,28 +659,13 @@ enum btr_cur_method {
reference is stored in the field
hash_node, and might be necessary to
update */
- BTR_CUR_BINARY, /*!< success using the binary search */
- BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
- the insert buffer */
- BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete
- mark in the insert/delete buffer */
- BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in
- the insert/delete buffer */
- BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */
+ BTR_CUR_BINARY /*!< success using the binary search */
};
/** The tree cursor: the definition appears here only for the compiler
to know struct size! */
struct btr_cur_t {
page_cur_t page_cur; /*!< page cursor */
- purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
- /*------------------------------*/
- que_thr_t* thr; /*!< this field is only used
- when search_leaf()
- is called for an index entry
- insertion: the calling query
- thread is passed here to be
- used in the insert buffer */
/*------------------------------*/
/** The following fields are used in
search_leaf() to pass information: */
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
deleted file mode 100644
index 0523829b..00000000
--- a/storage/innobase/include/btr0defragment.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*****************************************************************************
-
-Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2021, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-#ifndef btr0defragment_h
-#define btr0defragment_h
-
-#include "btr0pcur.h"
-
-/* Max number of pages to consider at once during defragmentation. */
-#define BTR_DEFRAGMENT_MAX_N_PAGES 32
-
-/** stats in btr_defragment */
-extern Atomic_counter<ulint> btr_defragment_compression_failures;
-extern Atomic_counter<ulint> btr_defragment_failures;
-extern Atomic_counter<ulint> btr_defragment_count;
-
-/******************************************************************//**
-Initialize defragmentation. */
-void
-btr_defragment_init(void);
-/******************************************************************//**
-Shutdown defragmentation. */
-void
-btr_defragment_shutdown();
-/******************************************************************//**
-Check whether the given index is in btr_defragment_wq. */
-bool
-btr_defragment_find_index(
- dict_index_t* index); /*!< Index to find. */
-/** Defragment an index.
-@param pcur persistent cursor
-@param thd current session, for checking thd_killed()
-@return whether the operation was interrupted */
-bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
-/******************************************************************//**
-When table is dropped, this function is called to mark a table as removed in
-btr_efragment_wq. The difference between this function and the remove_index
-function is this will not NULL the event. */
-void
-btr_defragment_remove_table(
- dict_table_t* table); /*!< Index to be removed. */
-/*********************************************************************//**
-Check whether we should save defragmentation statistics to persistent storage.*/
-void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
-
-/* Stop defragmentation.*/
-void btr_defragment_end();
-extern bool btr_defragment_active;
-#endif
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index fc829e78..966247ff 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -69,7 +69,7 @@ enum btr_latch_mode {
Used in btr_pcur_move_backward_from_page(). */
BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
/** Modify the previous record.
- Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+ Used in btr_pcur_move_backward_from_page(). */
BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
/** Start modifying the entire B-tree. */
BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
@@ -77,24 +77,8 @@ enum btr_latch_mode {
Only used by rtr_search_to_nth_level(). */
BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
- /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
- exclusive. */
- /** The search tuple will be inserted to the secondary index
- at the searched position. When the leaf page is not in the
- buffer pool, try to use the change buffer. */
- BTR_INSERT = 64,
-
- /** Try to delete mark a secondary index leaf page record at
- the searched position using the change buffer when the page is
- not in the buffer pool. */
- BTR_DELETE_MARK = 128,
-
- /** Try to purge the record using the change buffer when the
- secondary index leaf page is not in the buffer pool. */
- BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
-
/** The caller is already holding dict_index_t::lock S-latch. */
- BTR_ALREADY_S_LATCHED = 256,
+ BTR_ALREADY_S_LATCHED = 16,
/** Search and S-latch a leaf page, assuming that the
dict_index_t::lock S-latch is being held. */
BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
@@ -111,28 +95,15 @@ enum btr_latch_mode {
BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
| BTR_ALREADY_S_LATCHED,
- /** Attempt to delete-mark a secondary index record. */
- BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
- /** Attempt to delete-mark a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
- | BTR_ALREADY_S_LATCHED,
- /** Attempt to purge a secondary index record. */
- BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
- /** Attempt to purge a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
- | BTR_ALREADY_S_LATCHED,
-
/** In the case of BTR_MODIFY_TREE, the caller specifies
the intention to delete record only. It is used to optimize
block->lock range.*/
- BTR_LATCH_FOR_DELETE = 512,
+ BTR_LATCH_FOR_DELETE = 32,
/** In the case of BTR_MODIFY_TREE, the caller specifies
	the intention to insert record only. It is used to optimize
block->lock range.*/
- BTR_LATCH_FOR_INSERT = 1024,
+ BTR_LATCH_FOR_INSERT = 64,
/** Attempt to delete a record in the tree. */
BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
@@ -143,12 +114,8 @@ enum btr_latch_mode {
/** Attempt to insert a record into the tree. */
BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
- /** This flag ORed to BTR_INSERT says that we can ignore possible
- UNIQUE definition on secondary indexes when we decide if we can use
- the insert buffer to speed up inserts */
- BTR_IGNORE_SEC_UNIQUE = 2048,
/** Rollback in spatial index */
- BTR_RTREE_UNDO_INS = 4096,
+ BTR_RTREE_UNDO_INS = 128,
/** Try to delete mark a spatial index record */
- BTR_RTREE_DELETE_MARK = 8192
+ BTR_RTREE_DELETE_MARK = 256
};
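
With BTR_INSERT, BTR_DELETE_MARK, BTR_DELETE and BTR_IGNORE_SEC_UNIQUE gone, the remaining flags above are renumbered to small power-of-two bits, and the trimmed BTR_LATCH_MODE_WITHOUT_FLAGS() in btr0btr.h masks exactly those bits. The standalone sketch below copies the flag values and the BTR_MODIFY_TREE definition from these hunks; the values of BTR_SEARCH_LEAF and BTR_MODIFY_LEAF are placeholders, since this patch does not show them.

enum : unsigned {
  BTR_SEARCH_LEAF= 1, BTR_MODIFY_LEAF= 2,      // placeholder base modes
  BTR_MODIFY_TREE= 8 | BTR_MODIFY_LEAF,        // as in btr0types.h
  BTR_ALREADY_S_LATCHED= 16,
  BTR_LATCH_FOR_DELETE= 32,
  BTR_LATCH_FOR_INSERT= 64,
  BTR_RTREE_UNDO_INS= 128,
  BTR_RTREE_DELETE_MARK= 256,
  BTR_PURGE_TREE= BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
  BTR_INSERT_TREE= BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT
};

// Mirrors the trimmed BTR_LATCH_MODE_WITHOUT_FLAGS() macro.
constexpr unsigned without_flags(unsigned m)
{
  return m & ~(BTR_RTREE_UNDO_INS | BTR_RTREE_DELETE_MARK |
               BTR_ALREADY_S_LATCHED | BTR_LATCH_FOR_INSERT |
               BTR_LATCH_FOR_DELETE);
}

static_assert(without_flags(BTR_PURGE_TREE) == BTR_MODIFY_TREE,
              "flag bits are disjoint from the base latch modes");
static_assert(without_flags(BTR_INSERT_TREE) == BTR_MODIFY_TREE,
              "flag bits are disjoint from the base latch modes");

int main() {}
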
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index c291615c..b30763fa 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -45,13 +45,10 @@ Created 11/5/1995 Heikki Tuuri
/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET 10 /*!< get always */
+#define BUF_GET_RECOVER 9 /*!< like BUF_GET, but in recv_sys.recover() */
#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
the block young in the LRU list */
-#define BUF_GET_IF_IN_POOL_OR_WATCH 15
- /*!< Get the page only if it's in the
- buffer pool, if not then set a watch
- on the page. */
#define BUF_GET_POSSIBLY_FREED 16
/*!< Like BUF_GET, but do not mind
if the file page has been freed. */
@@ -204,11 +201,9 @@ buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+or BUF_PEEK_IF_IN_POOL
@param[in,out] mtr mini-transaction
@param[out] err DB_SUCCESS or error code
-@param[in] allow_ibuf_merge Allow change buffer merge while
-reading the pages from file.
@return pointer to the block or NULL */
buf_block_t*
buf_page_get_gen(
@@ -218,40 +213,12 @@ buf_page_get_gen(
buf_block_t* guess,
ulint mode,
mtr_t* mtr,
- dberr_t* err = NULL,
- bool allow_ibuf_merge = false)
- MY_ATTRIBUTE((nonnull(6), warn_unused_result));
-
-/** This is the low level function used to get access to a database page.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in] guess guessed block or NULL
-@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in,out] mtr mini-transaction, or NULL if a
- block with page_id is to be evicted
-@param[out] err DB_SUCCESS or error code
-@param[in] allow_ibuf_merge Allow change buffer merge to happen
-while reading the page from file
-then it makes sure that it does merging of change buffer changes while
-reading the page from file.
-@return pointer to the block or NULL */
-buf_block_t*
-buf_page_get_low(
- const page_id_t page_id,
- ulint zip_size,
- ulint rw_latch,
- buf_block_t* guess,
- ulint mode,
- mtr_t* mtr,
- dberr_t* err,
- bool allow_ibuf_merge);
+ dberr_t* err = nullptr);
/** Initialize a page in the buffer pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
of the functions which perform to a block a state transition NOT_USED => LRU
-(the other is buf_page_get_low()).
+(the other is buf_page_get_gen()).
@param[in,out] space space object
@param[in] offset offset of the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@@ -529,18 +496,16 @@ public:
static constexpr uint32_t REMOVE_HASH= 2;
/** smallest state() of a buffer page that is freed in the tablespace */
static constexpr uint32_t FREED= 3;
+ /* unused state: 1U<<29 */
/** smallest state() for a block that belongs to buf_pool.LRU */
- static constexpr uint32_t UNFIXED= 1U << 29;
- /** smallest state() of a block for which buffered changes may exist */
- static constexpr uint32_t IBUF_EXIST= 2U << 29;
+ static constexpr uint32_t UNFIXED= 2U << 29;
/** smallest state() of a (re)initialized page (no doublewrite needed) */
static constexpr uint32_t REINIT= 3U << 29;
/** smallest state() for an io-fixed block */
static constexpr uint32_t READ_FIX= 4U << 29;
+ /* unused state: 5U<<29 */
/** smallest state() for a write-fixed block */
- static constexpr uint32_t WRITE_FIX= 5U << 29;
- /** smallest state() for a write-fixed block with buffered changes */
- static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+ static constexpr uint32_t WRITE_FIX= 6U << 29;
/** smallest state() for a write-fixed block (no doublewrite was used) */
static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
/** buf_pool.LRU status mask in state() */
@@ -552,8 +517,7 @@ public:
byte *frame;
/* @} */
/** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
- is also protected by buf_pool.mutex;
- !frame && !zip.data means an active buf_pool.watch */
+ is also protected by buf_pool.mutex */
page_zip_des_t zip;
#ifdef UNIV_DEBUG
/** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
@@ -683,13 +647,6 @@ public:
bool is_freed() const
{ const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
- bool is_ibuf_exist() const
- {
- const auto s= state();
- ut_ad(s >= UNFIXED);
- ut_ad(s < READ_FIX);
- return (s & LRU_MASK) == IBUF_EXIST;
- }
bool is_reinit() const { return !(~state() & REINIT); }
void set_reinit(uint32_t prev_state)
@@ -700,29 +657,10 @@ public:
ut_ad(s < prev_state + UNFIXED);
}
- void set_ibuf_exist()
- {
- ut_ad(lock.is_write_locked());
- ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
- const auto s= state();
- ut_ad(s >= UNFIXED);
- ut_ad(s < READ_FIX);
- ut_ad(s < IBUF_EXIST || s >= REINIT);
- zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s));
- }
- void clear_ibuf_exist()
- {
- ut_ad(lock.is_write_locked());
- ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
- ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED);
- ut_ad(s >= IBUF_EXIST);
- ut_ad(s < REINIT);
- }
-
uint32_t read_unfix(uint32_t s)
{
ut_ad(lock.is_write_locked());
- ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
+ ut_ad(s == UNFIXED + 1 || s == REINIT + 1);
uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
ut_ad(old_state >= READ_FIX);
ut_ad(old_state < WRITE_FIX);
@@ -812,7 +750,7 @@ public:
uint32_t fix(uint32_t count= 1)
{
ut_ad(count);
- ut_ad(count < IBUF_EXIST);
+ ut_ad(count < REINIT);
uint32_t f= zip.fix.fetch_add(count);
ut_ad(f >= FREED);
ut_ad(!((f ^ (f + 1)) & LRU_MASK));
@@ -1417,78 +1355,10 @@ public:
public:
/** @return whether the buffer pool contains a page
- @tparam allow_watch whether to allow watch_is_sentinel()
@param page_id page identifier
@param chain hash table chain for page_id.fold() */
- template<bool allow_watch= false>
- TRANSACTIONAL_INLINE
- bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
- {
- transactional_shared_lock_guard<page_hash_latch> g
- {page_hash.lock_get(chain)};
- buf_page_t *bpage= page_hash.get(page_id, chain);
- if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
- {
- ut_ad(!bpage->in_zip_hash);
- ut_ad(!bpage->zip.data);
- if (!allow_watch)
- bpage= nullptr;
- }
- return bpage;
- }
-
- /** Determine if a block is a sentinel for a buffer pool watch.
- @param bpage page descriptor
- @return whether bpage a sentinel for a buffer pool watch */
- bool watch_is_sentinel(const buf_page_t &bpage)
- {
-#ifdef SAFE_MUTEX
- DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
- is_locked());
-#endif /* SAFE_MUTEX */
- ut_ad(bpage.in_file());
- if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
- return false;
- ut_ad(!bpage.in_zip_hash);
- ut_ad(!bpage.zip.data);
- return true;
- }
-
- /** Check if a watched page has been read.
- This may only be called after !watch_set() and before invoking watch_unset().
- @param id page identifier
- @return whether the page was read to the buffer pool */
- TRANSACTIONAL_INLINE
- bool watch_occurred(const page_id_t id)
- {
- hash_chain &chain= page_hash.cell_get(id.fold());
- transactional_shared_lock_guard<page_hash_latch> g
- {page_hash.lock_get(chain)};
- /* The page must exist because watch_set() increments buf_fix_count. */
- return !watch_is_sentinel(*page_hash.get(id, chain));
- }
-
- /** Register a watch for a page identifier.
- @param id page identifier
- @param chain page_hash.cell_get(id.fold())
- @return a buffer page corresponding to id
- @retval nullptr if the block was not present in page_hash */
- buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
-
- /** Stop watching whether a page has been read in.
- watch_set(id) must have returned nullptr before.
- @param id page identifier
- @param chain unlocked hash table chain */
- void watch_unset(const page_id_t id, hash_chain &chain);
-
- /** Remove the sentinel block for the watch before replacing it with a
- real block. watch_unset() or watch_occurred() will notice
- that the block has been replaced with the real block.
- @param w sentinel
- @param chain locked hash table chain
- @return w->state() */
- inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
+ TRANSACTIONAL_TARGET
+ bool page_hash_contains(const page_id_t page_id, hash_chain &chain);
/** @return whether less than 1/4 of the buffer pool is available */
TPOOL_SUPPRESS_TSAN
@@ -1883,9 +1753,6 @@ public:
# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
#endif
- /** Sentinels to detect if pages are read into the buffer pool while
- a delete-buffering operation is pending. Protected by mutex. */
- buf_page_t watch[innodb_purge_threads_MAX + 1];
/** Reserve a buffer. */
buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads)
{ return io_buf.reserve(wait_for_reads); }
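
The buf_page_t state word keeps its layout of a class in the top three bits and a fix count in the low 29 bits; only the classes are renumbered now that IBUF_EXIST and WRITE_FIX_IBUF are gone. A standalone illustration of that encoding follows; the class constants are copied from the hunk above, while LRU_MASK= 7U << 29 is an assumption based on the "N << 29" pattern.

#include <cassert>
#include <cstdint>

constexpr uint32_t UNFIXED= 2U << 29, READ_FIX= 4U << 29;
constexpr uint32_t LRU_MASK= 7U << 29;   // assumed value of the state mask

int main()
{
  uint32_t state= UNFIXED;
  state+= 3;                              // three fix() calls
  assert((state & LRU_MASK) == UNFIXED);  // class unchanged by fixing
  assert(state - UNFIXED == 3);           // fix count still recoverable

  // read_unfix(UNFIXED + 1) adds (s - READ_FIX): with unsigned wraparound,
  // READ_FIX + n becomes UNFIXED + n + 1.
  uint32_t io_fixed= READ_FIX + 3;
  io_fixed+= (UNFIXED + 1) - READ_FIX;
  assert((io_fixed & LRU_MASK) == UNFIXED);
  assert(io_fixed == UNFIXED + 4);
  return 0;
}
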
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
index 050c8493..048e3d15 100644
--- a/storage/innobase/include/buf0buf.inl
+++ b/storage/innobase/include/buf0buf.inl
@@ -79,7 +79,7 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
inline buf_block_t *buf_block_alloc()
{
- return buf_LRU_get_free_block(false);
+ return buf_LRU_get_free_block(have_no_mutex);
}
/********************************************************************//**
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index 6e7662d9..f912775d 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -53,9 +53,9 @@ class buf_dblwr_t
element* buf_block_arr;
};
- /** the page number of the first doublewrite block (block_size() pages) */
+ /** the page number of the first doublewrite block (block_size pages) */
page_id_t block1{0, 0};
- /** the page number of the second doublewrite block (block_size() pages) */
+ /** the page number of the second doublewrite block (block_size pages) */
page_id_t block2{0, 0};
/** mutex protecting the data members below */
@@ -74,6 +74,22 @@ class buf_dblwr_t
slot slots[2];
slot *active_slot;
+ /** Size of the doublewrite block in pages */
+ uint32_t block_size;
+
+public:
+ /** Values of use */
+ enum usage {
+ /** Assume that writes are atomic */
+ USE_NO= 0,
+ /** Use the doublewrite buffer with full durability */
+ USE_YES,
+ /** Durable writes to the doublewrite buffer, not to data files */
+ USE_FAST
+ };
+ /** The value of innodb_doublewrite */
+ ulong use;
+private:
/** Initialise the persistent storage of the doublewrite buffer.
@param header doublewrite page header in the TRX_SYS page */
inline void init(const byte *header);
@@ -126,9 +142,6 @@ public:
@param request the completed batch write request */
void flush_buffered_writes_completed(const IORequest &request);
- /** Size of the doublewrite block in pages */
- uint32_t block_size() const { return FSP_EXTENT_SIZE; }
-
/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param request asynchronous write request
@@ -139,6 +152,19 @@ public:
bool is_created() const
{ return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+ /** @return whether the doublewrite buffer is in use */
+ bool in_use() const { return is_created() && use; }
+ /** @return whether fsync() is needed on non-doublewrite pages */
+ bool need_fsync() const { return use < USE_FAST; }
+
+ void set_use(ulong use)
+ {
+ ut_ad(use <= USE_FAST);
+ mysql_mutex_lock(&mutex);
+ this->use= use;
+ mysql_mutex_unlock(&mutex);
+ }
+
/** @return whether a page identifier is part of the doublewrite buffer */
bool is_inside(const page_id_t id) const
{
@@ -147,8 +173,8 @@ public:
ut_ad(block1 < block2);
if (id < block1)
return false;
- const uint32_t size= block_size();
- return id < block1 + size || (id >= block2 && id < block2 + size);
+ return id < block1 + block_size ||
+ (id >= block2 && id < block2 + block_size);
}
/** Wait for flush_buffered_writes() to be fully completed */
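
With the enum above, innodb_doublewrite is no longer a plain boolean: buf_dblwr_t::use distinguishes USE_NO, USE_YES and USE_FAST, and in_use()/need_fsync() derive their answers from it. The following minimal standalone model mirrors those two predicates; the struct is a stand-in, not the real buf_dblwr_t.

#include <cassert>

enum usage { USE_NO= 0, USE_YES, USE_FAST };

struct dblwr_model
{
  bool created;        // stands in for is_created()
  unsigned long use;   // stands in for the innodb_doublewrite value
  bool in_use() const { return created && use; }
  bool need_fsync() const { return use < USE_FAST; }
};

int main()
{
  dblwr_model off{true, USE_NO}, full{true, USE_YES}, fast{true, USE_FAST};
  assert(!off.in_use());      // writes assumed atomic; buffer not used
  assert(full.need_fsync());  // full durability also for data files
  assert(!fast.need_fsync()); // durable doublewrites, no fsync() of data files
  return 0;
}
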
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index 28410276..c52fc05c 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -62,6 +62,17 @@ bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
@retval NULL if the free list is empty */
buf_block_t* buf_LRU_get_free_only();
+/** How to acquire a block */
+enum buf_LRU_get {
+ /** The caller is not holding buf_pool.mutex */
+ have_no_mutex= 0,
+ /** The caller is holding buf_pool.mutex */
+ have_mutex,
+  /** The caller is not holding buf_pool.mutex and tolerates failure
+  to allocate a block. */
+ have_no_mutex_soft
+};
+
/** Get a block from the buf_pool.free list.
If the list is empty, blocks will be moved from the end of buf_pool.LRU
to buf_pool.free.
@@ -83,9 +94,10 @@ we put it to free list to be used.
* scan whole LRU list
* scan LRU list even if buf_pool.try_LRU_scan is not set
-@param have_mutex whether buf_pool.mutex is already being held
-@return the free control block, in state BUF_BLOCK_MEMORY */
-buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+@param get how to allocate the block
+@return the free control block, in state BUF_BLOCK_MEMORY
+@retval nullptr if get==have_no_mutex_soft and memory was not available */
+buf_block_t* buf_LRU_get_free_block(buf_LRU_get get)
MY_ATTRIBUTE((malloc,warn_unused_result));
/** @return whether the unzip_LRU list should be used for evicting a victim
@@ -127,6 +139,10 @@ buf_unzip_LRU_add_block(
ibool old); /*!< in: TRUE if should be put to the end
of the list, else put to the start */
+/** Evict the temporary tablespace pages above the given threshold
+@param threshold Above this page to be removed from LRU list */
+void buf_LRU_truncate_temp(uint32_t threshold);
+
/** Update buf_pool.LRU_old_ratio.
@param[in] old_pct Reserve this percentage of
the buffer pool for "old" blocks
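
buf_LRU_get_free_block() now takes a buf_LRU_get value instead of a bool, and only the new have_no_mutex_soft mode may yield a null block. A standalone sketch of that contract follows; get_free_block() below is an illustrative stand-in, not the real allocator.

#include <cassert>

enum buf_LRU_get { have_no_mutex= 0, have_mutex, have_no_mutex_soft };

struct block {};                 // stands in for buf_block_t
static block reserved;

// Stand-in: the real function would evict or wait for memory unless the
// caller asked for have_no_mutex_soft.
static block *get_free_block(buf_LRU_get get, bool memory_available)
{
  if (memory_available)
    return &reserved;
  return get == have_no_mutex_soft ? nullptr : &reserved;
}

int main()
{
  assert(get_free_block(have_no_mutex, true) != nullptr);
  assert(get_free_block(have_mutex, false) != nullptr);         // would evict
  assert(get_free_block(have_no_mutex_soft, false) == nullptr); // caller copes
  return 0;
}
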
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index 3dd085dd..46d08243 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ The database buffer read
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0rea_h
-#define buf0rea_h
+#pragma once
#include "buf0buf.h"
@@ -33,15 +32,17 @@ Created 11/5/1995 Heikki Tuuri
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
-@param page_id page id
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted
+@param page_id page id
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param chain buf_pool.page_hash cell for page_id
+@retval DB_SUCCESS if the page was read and is not corrupted,
@retval DB_SUCCESS_LOCKED_REC if the page was not read
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_PAGE_CORRUPTED if the page is corrupted according to its checksum,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
-dberr_t buf_read_page(const page_id_t page_id, ulint zip_size);
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size,
+ buf_pool_t::hash_chain &chain);
/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
@@ -57,21 +58,14 @@ void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
-mechanism is not activated. NOTE 1: the calling thread may own latches on
+mechanism is not activated. NOTE: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
-end up waiting for these latches! NOTE 2: the calling thread must want
-access to the page given: this rule is set to prevent unintended read-aheads
-performed by ibuf routines, a situation which could result in a deadlock if
-the OS does not support asynchronous i/o.
+end up waiting for these latches!
@param[in] page_id page id of a page which the current thread
wants to access
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] ibuf whether we are inside ibuf routine
-@return number of page read requests issued; NOTE that if we read ibuf
-pages, it may happen that the page at the given page number does not
-get read even if we return a positive value! */
-ulint
-buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+@return number of page read requests issued */
+ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size);
/** Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
@@ -92,29 +86,15 @@ only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
-NOTE 3: the calling thread must want access to the page given: this rule is
-set to prevent unintended read-aheads performed by ibuf routines, a situation
-which could result in a deadlock if the OS does not support asynchronous io.
@param[in]	page_id		page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] ibuf whether if we are inside ibuf routine
@return number of page read requests issued */
-ulint
-buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size);
/** Schedule a page for recovery.
@param space tablespace
@param page_id page identifier
@param recs log records
-@param init page initialization, or nullptr if the page needs to be read */
+@param init_lsn page initialization LSN, or 0 if the page needs to be read */
void buf_read_recover(fil_space_t *space, const page_id_t page_id,
- page_recv_t &recs, recv_init *init);
-
-/** @name Modes used in read-ahead @{ */
-/** read only pages belonging to the insert buffer tree */
-#define BUF_READ_IBUF_PAGES_ONLY 131
-/** read any page */
-#define BUF_READ_ANY_PAGE 132
-/* @} */
-
-#endif
+ page_recv_t &recs, lsn_t init_lsn);
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
index 3d63ddb7..d4885186 100644
--- a/storage/innobase/include/data0type.h
+++ b/storage/innobase/include/data0type.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +33,6 @@ Created 1/16/1996 Heikki Tuuri
/** @return whether a length is actually stored in a field */
#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT)
-extern ulint data_mysql_default_charset_coll;
#define DATA_MYSQL_BINARY_CHARSET_COLL 63
/* SQL data type struct */
@@ -196,14 +195,6 @@ constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
/*-------------------------------------------*/
-/* This many bytes we need to store the type information affecting the
-alphabetical order for a single field and decide the storage size of an
-SQL null*/
-#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
-/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
-store the charset-collation number; one byte is left unused, though */
-#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
-
/* Maximum multi-byte character length in bytes, plus 1 */
#define DATA_MBMAX 8
@@ -344,13 +335,11 @@ charset-collation code.
DATA_BINARY_TYPE etc.
@param[in] charset_coll character-set collation code
@return precise type, including the charset-collation code */
-UNIV_INLINE
-uint32_t
-dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+inline uint32_t dtype_form_prtype(ulint old_prtype, ulint charset_coll)
{
- ut_ad(old_prtype < 256 * 256);
- ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
- return(uint32_t(old_prtype + (charset_coll << 16)));
+ ut_ad(old_prtype <= 0xffff);
+ ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+ return uint32_t(old_prtype | (charset_coll << 16));
}
/*********************************************************************//**
@@ -439,40 +428,6 @@ dtype_get_sql_null_size(
const dtype_t* type, /*!< in: type */
ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
-/**********************************************************************//**
-Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. */
-UNIV_INLINE
-void
-dtype_read_for_order_and_null_size(
-/*===============================*/
- dtype_t* type, /*!< in: type struct */
- const byte* buf); /*!< in: buffer for the stored order info */
-/**********************************************************************//**
-Stores for a type the information which determines its alphabetical ordering
-and the storage size of an SQL NULL value. This is the >= 4.1.x storage
-format. */
-UNIV_INLINE
-void
-dtype_new_store_for_order_and_null_size(
-/*====================================*/
- byte* buf, /*!< in: buffer for
- DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
- bytes where we store the info */
- const dtype_t* type, /*!< in: type struct */
- ulint prefix_len);/*!< in: prefix length to
- replace type->len, or 0 */
-/**********************************************************************//**
-Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. This is the 4.1.x storage
-format. */
-UNIV_INLINE
-void
-dtype_new_read_for_order_and_null_size(
-/*===================================*/
- dtype_t* type, /*!< in: type struct */
- const byte* buf); /*!< in: buffer for stored type order info */
-
/*********************************************************************//**
Validates a data type structure.
@return TRUE if ok */
@@ -494,8 +449,6 @@ struct dict_col_t;
If you add fields to this structure, be sure to initialize them everywhere.
This structure is initialized in the following functions:
dtype_set()
-dtype_read_for_order_and_null_size()
-dtype_new_read_for_order_and_null_size()
sym_tab_add_null_lit() */
struct dtype_t{
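
The rewritten dtype_form_prtype() above now ORs (rather than adds) the charset-collation code into the upper 16 bits of the precise type. A standalone check of that packing; the sample prtype value 0x00ff is arbitrary, while 63 is DATA_MYSQL_BINARY_CHARSET_COLL from this header.

#include <cassert>
#include <cstdint>

// Mirrors the rewritten dtype_form_prtype(), minus the ut_ad() assertions.
inline uint32_t form_prtype(unsigned old_prtype, unsigned charset_coll)
{
  assert(old_prtype <= 0xffff);
  return uint32_t(old_prtype | (charset_coll << 16));
}

int main()
{
  const uint32_t t= form_prtype(0x00ff, 63);  // 63 = binary charset collation
  assert(t == 0x003f00ffU);                   // collation lands in bits 16..31
  assert((t & 0xffff) == 0x00ff);             // low 16 bits stay unchanged
  return 0;
}
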
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
index 329cee5d..add6c211 100644
--- a/storage/innobase/include/data0type.inl
+++ b/storage/innobase/include/data0type.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -183,126 +183,6 @@ dtype_get_mbmaxlen(
return type->mbmaxlen;
}
-/**********************************************************************//**
-Stores for a type the information which determines its alphabetical ordering
-and the storage size of an SQL NULL value. This is the >= 4.1.x storage
-format. */
-UNIV_INLINE
-void
-dtype_new_store_for_order_and_null_size(
-/*====================================*/
- byte* buf, /*!< in: buffer for
- DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
- bytes where we store the info */
- const dtype_t* type, /*!< in: type struct */
- ulint prefix_len)/*!< in: prefix length to
- replace type->len, or 0 */
-{
- compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
-
- ulint len;
-
- ut_ad(type);
- ut_ad(type->mtype >= DATA_VARCHAR);
- ut_ad(type->mtype <= DATA_MTYPE_MAX);
-
- buf[0] = (byte)(type->mtype & 0xFFUL);
-
- if (type->prtype & DATA_BINARY_TYPE) {
- buf[0] |= 128;
- }
-
- /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
- buf[0] |= 64;
- }
- */
-
- buf[1] = (byte)(type->prtype & 0xFFUL);
-
- len = prefix_len ? prefix_len : type->len;
-
- mach_write_to_2(buf + 2, len & 0xFFFFUL);
-
- ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
- mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
-
- if (type->prtype & DATA_NOT_NULL) {
- buf[4] |= 128;
- }
-}
-
-/**********************************************************************//**
-Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. This is the < 4.1.x
-storage format. */
-UNIV_INLINE
-void
-dtype_read_for_order_and_null_size(
-/*===============================*/
- dtype_t* type, /*!< in: type struct */
- const byte* buf) /*!< in: buffer for stored type order info */
-{
- compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
- type->mtype = buf[0] & 63;
- type->prtype = buf[1];
-
- if (buf[0] & 128) {
- type->prtype |= DATA_BINARY_TYPE;
- }
-
- type->len = mach_read_from_2(buf + 2);
-
- type->prtype = dtype_form_prtype(type->prtype,
- data_mysql_default_charset_coll);
- dtype_set_mblen(type);
-}
-
-/**********************************************************************//**
-Reads to a type the stored information which determines its alphabetical
-ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
-storage format. */
-UNIV_INLINE
-void
-dtype_new_read_for_order_and_null_size(
-/*===================================*/
- dtype_t* type, /*!< in: type struct */
- const byte* buf) /*!< in: buffer for stored type order info */
-{
- compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
-
- type->mtype = buf[0] & 63;
- type->prtype = buf[1];
-
- if (buf[0] & 128) {
- type->prtype |= DATA_BINARY_TYPE;
- }
-
- if (buf[4] & 128) {
- type->prtype |= DATA_NOT_NULL;
- }
-
- type->len = mach_read_from_2(buf + 2);
-
- ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
-
- if (dtype_is_string_type(type->mtype)) {
- ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
-
- if (charset_coll == 0) {
- /* This insert buffer record was inserted with MySQL
- version < 4.1.2, and the charset-collation code was not
- explicitly stored to dtype->prtype at that time. It
- must be the default charset-collation of this MySQL
- installation. */
-
- charset_coll = data_mysql_default_charset_coll;
- }
-
- type->prtype = dtype_form_prtype(type->prtype, charset_coll);
- }
- dtype_set_mblen(type);
-}
-
/***********************************************************************//**
Returns the size of a fixed size data type, 0 if not a fixed size type.
@return fixed size, or 0 */
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index a6528747..68400d20 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -44,39 +44,6 @@ dict_hdr_get_new_id(
(not assigned if NULL) */
uint32_t* space_id); /*!< out: space id
(not assigned if NULL) */
-/** Update dict_sys.row_id in the dictionary header file page. */
-void dict_hdr_flush_row_id(row_id_t id);
-/** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
-inline row_id_t dict_sys_t::get_new_row_id()
-{
- row_id_t id= row_id.fetch_add(1);
- if (!(id % ROW_ID_WRITE_MARGIN))
- dict_hdr_flush_row_id(id);
- return id;
-}
-
-/** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
-inline void dict_sys_t::update_row_id(row_id_t id)
-{
- row_id_t sys_id= row_id;
- while (id >= sys_id)
- {
- if (!row_id.compare_exchange_strong(sys_id, id))
- continue;
- if (!(id % ROW_ID_WRITE_MARGIN))
- dict_hdr_flush_row_id(id);
- break;
- }
-}
-
-/**********************************************************************//**
-Writes a row id to a record or other 6-byte stored form. */
-inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
-{
- static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
- mach_write_to_6(field, row_id);
-}
-
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created.
@@ -116,7 +83,7 @@ inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
/*-------------------------------------------------------------*/
/* Dictionary header offsets */
-#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+//#define DICT_HDR_ROW_ID 0 /* Was: latest assigned DB_ROW_ID */
#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
deleted file mode 100644
index 679484ad..00000000
--- a/storage/innobase/include/dict0defrag_bg.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2016, 2021, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/dict0defrag_bg.h
-Code used for background table and index
-defragmentation
-
-Created 25/08/2016 Jan Lindström
-*******************************************************/
-
-#ifndef dict0defrag_bg_h
-#define dict0defrag_bg_h
-
-#include "dict0types.h"
-
-/** Indices whose defrag stats need to be saved to persistent storage.*/
-struct defrag_pool_item_t {
- table_id_t table_id;
- index_id_t index_id;
-};
-
-/** Allocator type, used by std::vector */
-typedef ut_allocator<defrag_pool_item_t>
- defrag_pool_allocator_t;
-
-/** The multitude of tables to be defragmented- an STL vector */
-typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t>
- defrag_pool_t;
-
-/** Pool where we store information on which tables are to be processed
-by background defragmentation. */
-extern defrag_pool_t defrag_pool;
-
-/*****************************************************************//**
-Initialize the defrag pool, called once during thread initialization. */
-void
-dict_defrag_pool_init(void);
-/*========================*/
-
-/*****************************************************************//**
-Free the resources occupied by the defrag pool, called once during
-thread de-initialization. */
-void
-dict_defrag_pool_deinit(void);
-/*==========================*/
-
-/*****************************************************************//**
-Add an index in a table to the defrag pool, which is processed by the
-background stats gathering thread. Only the table id and index id are
-added to the list, so the table can be closed after being enqueued and
-it will be opened when needed. If the table or index does not exist later
-(has been DROPped), then it will be removed from the pool and skipped. */
-void
-dict_stats_defrag_pool_add(
-/*=======================*/
- const dict_index_t* index); /*!< in: table to add */
-
-/*****************************************************************//**
-Delete a given index from the auto defrag pool. */
-void
-dict_stats_defrag_pool_del(
-/*=======================*/
- const dict_table_t* table, /*!<in: if given, remove
- all entries for the table */
- const dict_index_t* index); /*!< in: index to remove */
-
-/**
-Get the first index that has been added for updating persistent defrag
-stats and eventually save its stats. */
-void dict_defrag_process_entries_from_defrag_pool(THD *thd);
-
-/*********************************************************************//**
-Save defragmentation result.
-@return DB_SUCCESS or error code */
-dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/*********************************************************************//**
-Save defragmentation stats for a given index.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_save_defrag_stats(
-/*============================*/
- dict_index_t* index); /*!< in: index */
-#endif /* dict0defrag_bg_h */
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 3baac658..47350f9c 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -2,7 +2,7 @@
Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -649,8 +649,6 @@ dict_table_get_next_index(
#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
#define dict_index_is_unique(index) (index)->is_unique()
#define dict_index_is_spatial(index) (index)->is_spatial()
-#define dict_index_is_ibuf(index) (index)->is_ibuf()
-#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary()
#define dict_index_has_virtual(index) (index)->has_virtual()
/** Get all the FTS indexes on a table.
@@ -665,7 +663,7 @@ dict_table_get_all_fts_indexes(
/********************************************************************//**
Gets the number of user-defined non-virtual columns in a table in the
dictionary cache.
-@return number of user-defined (e.g., not ROW_ID) non-virtual
+@return number of user-defined (e.g., not DB_ROW_ID) non-virtual
columns of a table */
UNIV_INLINE
unsigned
@@ -1381,27 +1379,10 @@ private:
std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
/** hash table of temporary table IDs */
hash_table_t temp_id_hash;
- /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
- (FIXME: remove this, and move to dict_table_t) */
- Atomic_relaxed<row_id_t> row_id;
- /** The synchronization interval of row_id */
- static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
public:
/** Diagnostic message for exceeding the lock_wait() timeout */
static const char fatal_msg[];
- /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
- inline row_id_t get_new_row_id();
-
- /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
- inline void update_row_id(row_id_t id);
-
- /** Recover the global DB_ROW_ID sequence on database startup */
- void recover_row_id(row_id_t id)
- {
- row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
- }
-
/** @return a new temporary table ID */
table_id_t acquire_temporary_table_id()
{
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
index 4cc3eae9..ead22a21 100644
--- a/storage/innobase/include/dict0dict.inl
+++ b/storage/innobase/include/dict0dict.inl
@@ -244,7 +244,7 @@ dict_table_get_next_index(
/********************************************************************//**
Gets the number of user-defined non-virtual columns in a table in the
dictionary cache.
-@return number of user-defined (e.g., not ROW_ID) non-virtual
+@return number of user-defined (e.g., not DB_ROW_ID) non-virtual
columns of a table */
UNIV_INLINE
unsigned
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index 3143aafd..c774a792 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -43,8 +43,10 @@ typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system.
Open each data file if an encryption plugin has been loaded.
-@param spaces set of tablespace files to open */
-void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces);
+@param spaces set of tablespace files to open
+@param upgrade whether we need to invoke ibuf_upgrade() */
+void dict_load_tablespaces(const std::set<uint32_t> *spaces= nullptr,
+ bool upgrade= false);
/** Make sure the data_file_name is saved in dict_table_t if needed.
@param[in,out] table Table object */
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 0268a280..52bb4777 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -65,7 +65,6 @@ combination of types */
auto-generated clustered indexes,
also DICT_UNIQUE will be set */
#define DICT_UNIQUE 2 /*!< unique index */
-#define DICT_IBUF 8 /*!< insert buffer tree */
#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag
in SYS_INDEXES.TYPE */
#define DICT_FTS 32 /* FTS index; can't be combined with the
@@ -266,7 +265,8 @@ use its own tablespace instead of the system tablespace. */
#define DICT_TF2_USE_FILE_PER_TABLE 16U
/** Set when we discard/detach the tablespace */
-#define DICT_TF2_DISCARDED 32U
+constexpr unsigned DICT_TF2_POS_DISCARDED= 5;
+constexpr unsigned DICT_TF2_DISCARDED= 1U << DICT_TF2_POS_DISCARDED;
/** This bit is set if all aux table names (both common tables and
index tables) of a FTS table are in HEX format. */
@@ -947,10 +947,6 @@ struct zip_pad_info_t {
rounds */
};
-/** Number of samples of data size kept when page compression fails for
-a certain index.*/
-#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
-
/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
system clustered index when there is no primary key. */
const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
@@ -996,7 +992,7 @@ struct dict_index_t {
# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
unsigned type:DICT_IT_BITS;
/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
- DICT_IBUF, DICT_CORRUPT) */
+ DICT_CORRUPT) */
#define MAX_KEY_LENGTH_BITS 12
unsigned trx_id_offset:MAX_KEY_LENGTH_BITS;
/*!< position of the trx id column
@@ -1116,23 +1112,6 @@ struct dict_index_t {
/*!< has persistent statistics error printed
for this index ? */
/* @} */
- /** Statistics for defragmentation, these numbers are estimations and
- could be very inaccurate at certain times, e.g. right after restart,
- during defragmentation, etc. */
- /* @{ */
- ulint stat_defrag_modified_counter;
- ulint stat_defrag_n_pages_freed;
- /* number of pages freed by defragmentation. */
- ulint stat_defrag_n_page_split;
- /* number of page splits since last full index
- defragmentation. */
- ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
- /* data size when compression failure happened
- the most recent 10 times. */
- ulint stat_defrag_sample_next_slot;
- /* in which slot the next sample should be
- saved. */
- /* @} */
private:
/** R-tree split sequence number */
Atomic_relaxed<node_seq_t> rtr_ssn;
@@ -1184,12 +1163,8 @@ public:
/** @return whether instant ALTER TABLE is in effect */
inline bool is_instant() const;
- /** @return whether the index is the primary key index
- (not the clustered index of the change buffer) */
- bool is_primary() const
- {
- return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
- }
+ /** @return whether the index is the primary key index */
+ bool is_primary() const { return is_clust(); }
/** @return whether this is a generated clustered index */
bool is_gen_clust() const { return type == DICT_CLUSTERED; }
@@ -1203,16 +1178,13 @@ public:
/** @return whether this is a spatial index */
bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
- /** @return whether this is the change buffer */
- bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
-
/** @return whether this index requires locking */
- bool has_locking() const { return !is_ibuf(); }
+ static constexpr bool has_locking() { return true; }
/** @return whether this is a normal B-tree index
  (not SPATIAL or FULLTEXT) */
bool is_btree() const {
- return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+ return UNIV_LIKELY(!(type & (DICT_SPATIAL
| DICT_FTS | DICT_CORRUPT)));
}
@@ -2126,8 +2098,9 @@ public:
process of altering partitions */
unsigned skip_alter_undo:1;
- /*!< whether this is in a single-table tablespace and the .ibd
- file is missing or page decryption failed and page is corrupted */
+	/** whether this is in a single-table tablespace and the .ibd file
+	is believed to be missing, or page decryption failed and the page
+	is corrupted */
unsigned file_unreadable:1;
/** TRUE if the table object has been added to the dictionary cache. */
@@ -2355,6 +2328,8 @@ private:
Atomic_relaxed<pthread_t> lock_mutex_owner{0};
#endif
public:
+ /** The next DB_ROW_ID value */
+ Atomic_counter<uint64_t> row_id{0};
/** Autoinc counter value to give to the next inserted row. */
uint64_t autoinc;
@@ -2632,19 +2607,6 @@ dict_col_get_spatial_status(
return(spatial_status);
}
-/** Clear defragmentation summary. */
-inline void dict_stats_empty_defrag_summary(dict_index_t* index)
-{
- index->stat_defrag_n_pages_freed = 0;
-}
-
-/** Clear defragmentation related index stats. */
-inline void dict_stats_empty_defrag_stats(dict_index_t* index)
-{
- index->stat_defrag_modified_counter = 0;
- index->stat_defrag_n_page_split = 0;
-}
-
#include "dict0mem.inl"
#endif /* dict0mem_h */
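
DICT_TF2_DISCARDED is now expressed through its bit position. A one-line standalone check that the rewrite keeps the old value:

constexpr unsigned DICT_TF2_POS_DISCARDED= 5;
constexpr unsigned DICT_TF2_DISCARDED= 1U << DICT_TF2_POS_DISCARDED;
static_assert(DICT_TF2_DISCARDED == 32U, "same value as the old literal 32U");
int main() { return 0; }
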
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 3b006daf..720c8e00 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -218,30 +218,15 @@ dict_stats_save_index_stat(
trx_t* trx)
MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
-/** Report an error if updating table statistics failed because
-.ibd file is missing, table decryption failed or table is corrupted.
-@param[in,out] table Table
-@param[in] defragment true if statistics is for defragment
-@retval DB_DECRYPTION_FAILED if decryption of the table failed
-@retval DB_TABLESPACE_DELETED if .ibd file is missing
-@retval DB_CORRUPTION if table is marked as corrupted */
-dberr_t
-dict_stats_report_error(dict_table_t* table, bool defragment = false)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
#include "dict0stats.inl"
#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
void test_dict_stats_all();
#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
-/** Write all zeros (or 1 where it makes sense) into a table
-and its indexes'statistics members. The resulting stats
-correspond to an empty table.
-@param table table stats to be emptied
-@param empty_defrag_stats empty the defrag stats */
-void
-dict_stats_empty_table(
- dict_table_t* table,
- bool empty_defrag_stats);
+/** Write all zeros (or 1 where it makes sense) into a table and its indexes'
+statistics members. The resulting stats correspond to an empty table.
+@param table table statistics to be emptied */
+void dict_stats_empty_table(dict_table_t *table);
+
#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index ec50e8cd..f6169227 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,10 +48,6 @@ struct dict_add_v_col_t;
#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
-/* The ibuf table and indexes's ID are assigned as the number
-DICT_IBUF_ID_MIN plus the space id */
-#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL
-
typedef ib_id_t table_id_t;
typedef ib_id_t index_id_t;
@@ -136,13 +132,6 @@ struct table_name_t
inline bool is_temporary() const;
};
-#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
-/** Dump the change buffer at startup */
-extern my_bool ibuf_dump;
-/** Flag to control insert buffer debugging. */
-extern uint ibuf_debug;
-#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
-
/** Shift for spatial status */
#define SPATIAL_STATUS_SHIFT 12
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index dfda1178..41b6c59f 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,38 +48,6 @@ struct named_spaces_tag_t;
using space_list_t= ilist<fil_space_t, space_list_tag_t>;
-// Forward declaration
-extern my_bool srv_use_doublewrite_buf;
-
-/** Possible values of innodb_flush_method */
-enum srv_flush_t
-{
- /** fsync, the default */
- SRV_FSYNC= 0,
- /** open log files in O_DSYNC mode */
- SRV_O_DSYNC,
- /** do not call os_file_flush() when writing data files, but do flush
- after writing to log files */
- SRV_LITTLESYNC,
- /** do not flush after writing */
- SRV_NOSYNC,
- /** Open or create files with O_DIRECT. This implies using
- unbuffered I/O but still fdatasync(), because some filesystems might
- not flush meta-data on write completion */
- SRV_O_DIRECT,
- /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
- durable on write completion */
- SRV_O_DIRECT_NO_FSYNC
-#ifdef _WIN32
- /** Traditional Windows appoach to open all files without caching,
- and do FileFlushBuffers() */
- ,SRV_ALL_O_DIRECT_FSYNC
-#endif
-};
-
-/** innodb_flush_method */
-extern ulong srv_file_flush_method;
-
/** Undo tablespaces starts with space_id. */
extern uint32_t srv_undo_space_id_start;
/** The number of UNDO tablespaces that are open and ready to use. */
@@ -645,6 +613,8 @@ private:
}
public:
+ /** Reopen all files on set_write_through() or set_buffered(). */
+ static void reopen_all();
/** Try to close a file to adhere to the innodb_open_files limit.
@param print_info whether to diagnose why a file cannot be closed
@return whether a file was closed */
@@ -958,6 +928,11 @@ public:
freed_ranges.add_range(range);
}
+  /** Clear the freed ranges in the temporary tablespace
+  that fall within the region being shrunk.
+  @param threshold size to which the tablespace is being truncated */
+ inline void clear_freed_ranges(uint32_t threshold);
+
/** Set the tablespace size in pages */
void set_sizes(uint32_t s)
{
@@ -1035,6 +1010,9 @@ public:
/** @return the tablespace name (databasename/tablename) */
name_type name() const;
+ /** Update the data structures on write completion */
+ void complete_write();
+
private:
/** @return whether the file is usable for io() */
ATTRIBUTE_COLD bool prepare_acquired();
@@ -1107,9 +1085,6 @@ struct fil_node_t final
@return detached handle or OS_FILE_CLOSED */
inline pfs_os_file_t close_to_free(bool detach_handle= false);
- /** Update the data structures on write completion */
- inline void complete_write();
-
private:
/** Does stuff common for close() and detach() */
void prepare_to_close_or_detach();
@@ -1117,8 +1092,7 @@ private:
inline bool fil_space_t::use_doublewrite() const
{
- return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
- buf_dblwr.is_created();
+ return !UT_LIST_GET_FIRST(chain)->atomic_write && buf_dblwr.in_use();
}
inline void fil_space_t::set_imported()
@@ -1285,11 +1259,11 @@ constexpr uint16_t FIL_PAGE_RTREE= 17854;
constexpr uint16_t FIL_PAGE_UNDO_LOG= 2;
/** Index node (of file-in-file metadata) */
constexpr uint16_t FIL_PAGE_INODE= 3;
-/** Insert buffer free list */
+/** Former change buffer free list */
constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4;
/** Freshly allocated page */
constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0;
-/** Change buffer bitmap (pages n*innodb_page_size+1) */
+/** Former change buffer bitmap pages (pages n*innodb_page_size+1) */
constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5;
/** System page */
constexpr uint16_t FIL_PAGE_TYPE_SYS= 6;
@@ -1379,9 +1353,9 @@ struct fil_system_t
Some members may require late initialisation, thus we just mark object as
uninitialised. Real initialisation happens in create().
*/
- fil_system_t() : m_initialised(false) {}
+ fil_system_t() {}
- bool is_initialised() const { return m_initialised; }
+ bool is_initialised() const { return spaces.array; }
/**
Create the file system interface at database start.
@@ -1394,8 +1368,6 @@ struct fil_system_t
void close();
private:
- bool m_initialised;
-
/** Points to the last opened space in space_list. Protected with
fil_system.mutex. */
fil_space_t *space_list_last_opened= nullptr;
@@ -1430,6 +1402,33 @@ public:
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
hash_table_t spaces;
+
+ /** false=invoke fsync() or fdatasync() on data files before checkpoint;
+ true=each write is durable (O_DSYNC) */
+ my_bool write_through;
+ /** whether data files are buffered (not O_DIRECT) */
+ my_bool buffered;
+ /** whether fdatasync() is needed on data files */
+ Atomic_relaxed<bool> need_unflushed_spaces;
+
+ /** Try to enable or disable write-through of data files */
+ void set_write_through(bool write_through);
+ /** Update innodb_doublewrite */
+ void set_use_doublewrite(ulong use)
+ {
+ buf_dblwr.set_use(use);
+ need_unflushed_spaces= !write_through && buf_dblwr.need_fsync();
+ }
+
+ /** Try to enable or disable file system caching of data files */
+ void set_buffered(bool buffered);
+
+ TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; }
+ TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; }
+
+ /** @return whether to update unflushed_spaces */
+ bool use_unflushed_spaces() const { return need_unflushed_spaces; }
+
/** tablespaces for which fil_space_t::needs_flush() holds */
sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
/** number of currently open files; protected by mutex */
@@ -1555,12 +1554,7 @@ template<bool have_reference> inline void fil_space_t::flush()
mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(!have_reference || (pending() & PENDING));
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
- if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
- {
- ut_ad(!is_in_unflushed_spaces);
- ut_ad(!needs_flush());
- }
- else if (have_reference)
+ if (have_reference)
flush_low();
else
{
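
The new fil_system_t members replace the removed srv_flush_t policy with two
runtime-settable flags (write_through, buffered) plus a cached flag recording
whether data files still need fdatasync() tracking. Below is a minimal
standalone sketch of how those flags combine, assuming a boolean stand-in for
buf_dblwr.need_fsync(); only the "!write_through && need_fsync" rule comes
from the hunk above, the type and helper names are illustrative.

    #include <atomic>

    struct fil_system_flags            // illustrative stand-in, not fil_system_t
    {
      bool write_through= false;       // true: every data file write is durable (O_DSYNC)
      bool buffered= false;            // true: files are opened without O_DIRECT
      std::atomic<bool> need_unflushed_spaces{false};

      // mirrors fil_system_t::set_use_doublewrite(): explicit fdatasync()
      // tracking is only needed when writes are not already durable and the
      // doublewrite buffer still requires a flush
      void set_use_doublewrite(bool dblwr_needs_fsync)
      {
        need_unflushed_spaces.store(!write_through && dblwr_needs_fsync,
                                    std::memory_order_relaxed);
      }

      // mirrors fil_system_t::use_unflushed_spaces()
      bool use_unflushed_spaces() const
      { return need_unflushed_spaces.load(std::memory_order_relaxed); }
    };
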
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
index 67e79f1a..62dec39b 100644
--- a/storage/innobase/include/fsp0file.h
+++ b/storage/innobase/include/fsp0file.h
@@ -317,6 +317,8 @@ public:
void set_space_id(uint32_t space_id) { m_space_id= space_id; }
void set_flags(uint32_t flags) { m_flags = flags; }
+
+ uint32_t param_size() const { return m_user_param_size; }
private:
/** Free the filepath buffer. */
void free_filepath();
@@ -401,6 +403,9 @@ private:
pages in SysTablespace::normalize_size() */
uint32_t m_size;
+	/** Initial size in pages, as specified by the user parameter */
+ uint32_t m_user_param_size;
+
/** ordinal position of this datafile in the tablespace */
ulint m_order;
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 99459bcb..015cb48c 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -555,6 +555,12 @@ inline void fsp_init_file_page(
mtr->init(block);
}
+/** Truncate the system tablespace */
+void fsp_system_tablespace_truncate();
+
+/** Truncate the temporary tablespace */
+void fsp_shrink_temp_space();
+
#ifndef UNIV_DEBUG
# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
#endif
diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h
index 514f3fdb..3ff0e864 100644
--- a/storage/innobase/include/fsp0sysspace.h
+++ b/storage/innobase/include/fsp0sysspace.h
@@ -119,6 +119,12 @@ public:
return(m_auto_extend_last_file);
}
+	/** @return whether automatic shrinking is enabled */
+ bool can_auto_shrink() const
+ {
+ return m_auto_shrink;
+ }
+
/** Set the last file size.
@param[in] size the size to set */
void set_last_file_size(uint32_t size)
@@ -144,6 +150,16 @@ public:
}
/**
+	@return user-specified tablespace size, in pages */
+ uint32_t get_min_size() const
+ {
+ uint32_t full_size= 0;
+ for (uint32_t i= 0; i < m_files.size(); i++)
+ full_size+= m_files.at(i).m_user_param_size;
+ return full_size;
+ }
+
+ /**
@return next increment size */
uint32_t get_increment() const;
@@ -251,6 +267,10 @@ private:
/** if false, then sanity checks are still pending */
bool m_sanity_checks_done;
+
+	/** whether the system tablespace should be
+	shrunk automatically */
+ bool m_auto_shrink;
};
/* GLOBAL OBJECTS */
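
The get_min_size() accessor added above sums the per-file sizes taken from the
startup parameter, presumably serving as the lower bound for the new automatic
shrinking (m_auto_shrink). A hedged standalone sketch of that summation; the
datafile struct below models only m_user_param_size and is not the real
Datafile class.

    #include <cstdint>
    #include <vector>

    struct datafile { uint32_t m_user_param_size; }; // pages from the startup parameter

    // mirrors SysTablespace::get_min_size(): the tablespace may shrink,
    // but never below the total size the user configured for its files
    static uint32_t get_min_size(const std::vector<datafile> &files)
    {
      uint32_t full_size= 0;
      for (const datafile &f : files)
        full_size+= f.m_user_param_size;
      return full_size;
    }
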
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index 9a23e840..757ead55 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -157,28 +157,20 @@ this many file pages */
/* This has been replaced with either srv_page_size or page_zip->size. */
/** @name The space low address page map
-The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+The 2 pages at FSP_XDES_OFFSET are repeated
every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
/* @{ */
/*--------------------------------------*/
#define FSP_XDES_OFFSET 0U /* !< extent descriptor */
-#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */
- /* The ibuf bitmap pages are the ones whose
- page number is the number above plus a
- multiple of XDES_DESCRIBED_PER_PAGE */
-
#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */
/* The following pages exist
in the system tablespace (space 0). */
-#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer
+#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< former change buffer
header page, in
tablespace 0 */
-#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< former change buffer
B-tree root page in
tablespace 0 */
- /* The ibuf tree root page number in
- tablespace 0; its fseg inode is on the page
- number FSP_FIRST_INODE_PAGE_NO */
#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction
system header, in
tablespace 0 */
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index 1adec365..dc8806a5 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -152,6 +152,15 @@ inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
return flst_read_addr(node + FLST_PREV);
}
+/** Write a file address.
+@param[in] block file page
+@param[in,out] faddr file address location
+@param[in] page page number
+@param[in] boffset byte offset
+@param[in,out] mtr mini-transaction */
+void flst_write_addr(const buf_block_t &block, byte *faddr,
+ uint32_t page, uint16_t boffset, mtr_t *mtr);
+
# ifdef UNIV_DEBUG
/** Validate a file-based list. */
void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
index b07261ce..724a764d 100644
--- a/storage/innobase/include/gis0rtree.h
+++ b/storage/innobase/include/gis0rtree.h
@@ -62,40 +62,45 @@ Created 2013/03/27 Jimmy Yang and Allen Lai
/** Search for a spatial index leaf page record.
@param cur cursor
+@param thr query thread
@param tuple search tuple
@param latch_mode latching mode
@param mtr mini-transaction
@param mode search mode */
-dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+dberr_t rtr_search_leaf(btr_cur_t *cur, que_thr_t *thr, const dtuple_t *tuple,
btr_latch_mode latch_mode, mtr_t *mtr,
page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+ MY_ATTRIBUTE((nonnull(1,3,5), warn_unused_result));
/** Search for inserting a spatial index leaf page record.
@param cur cursor
@param tuple search tuple
@param latch_mode latching mode
@param mtr mini-transaction */
-inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+inline dberr_t rtr_insert_leaf(btr_cur_t *cur, que_thr_t *thr,
+ const dtuple_t *tuple,
btr_latch_mode latch_mode, mtr_t *mtr)
{
- return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT);
+ return rtr_search_leaf(cur, thr, tuple, latch_mode, mtr,
+ PAGE_CUR_RTREE_INSERT);
}
/** Search for a spatial index leaf page record.
-@param pcur cursor
+@param pcur cursor
+@param thr query thread
@param tuple search tuple
@param mode search mode
@param mtr mini-transaction */
-dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, que_thr_t *thr,
+ const dtuple_t *tuple,
page_cur_mode_t mode, mtr_t *mtr)
MY_ATTRIBUTE((nonnull, warn_unused_result));
-dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
- page_cur_mode_t mode,
- btr_latch_mode latch_mode,
- btr_cur_t *cur, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+dberr_t rtr_search_to_nth_level(btr_cur_t *cur, que_thr_t *thr,
+ const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr,
+ page_cur_mode_t mode, ulint level)
+ MY_ATTRIBUTE((nonnull(1,3,5), warn_unused_result));
/**********************************************************************//**
Builds a Rtree node pointer out of a physical record and a page number.
@@ -132,7 +137,29 @@ rtr_page_split_and_insert(
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr, /*!< in: mtr */
- dberr_t* err); /*!< out: error code */
+ dberr_t* err, /*!< out: error code */
+ que_thr_t* thr); /*!< in: query thread */
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts the tuple.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+rtr_root_raise_and_insert(
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err, /*!< out: error code */
+ que_thr_t* thr); /*!< in: query thread */
/**************************************************************//**
Sets the child node mbr in a node pointer. */
@@ -243,8 +270,8 @@ rtr_create_rtr_info(
bool init_matches, /*!< in: Whether to initiate the
"matches" structure for collecting
matched leaf records */
- btr_cur_t* cursor, /*!< in: tree search cursor */
- dict_index_t* index); /*!< in: index struct */
+ que_thr_t* thr, /*!< in/out: query thread */
+ btr_cur_t* cursor); /*!< in: tree search cursor */
/********************************************************************//**
Update a btr_cur_t with rtr_info */
@@ -299,8 +326,10 @@ rtr_get_mbr_from_tuple(
about parent nodes in search
@param[in,out] cursor cursor on node pointer record,
its page x-latched
+@param[in,out] thr query thread
@return whether the cursor was successfully positioned */
-bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor,
+ que_thr_t *thr)
MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
/************************************************************//**
@@ -312,11 +341,12 @@ rtr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- mtr_t* mtr, /*!< in: mtr */
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
about parent nodes in search */
- btr_cur_t* cursor);/*!< out: cursor on node pointer record,
+ btr_cur_t* cursor, /*!< out: cursor on node pointer record,
its page x-latched */
+ que_thr_t* thr, /*!< in/out: query thread */
+ mtr_t* mtr); /*!< in/out: mtr */
/**************************************************************//**
Store the parent path cursor
@return number of cursor stored */
@@ -337,6 +367,7 @@ bool rtr_search(
const dtuple_t* tuple, /*!< in: tuple on which search done */
btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+	que_thr_t*	thr,	/*!< in/out: query thread */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((warn_unused_result));
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
index 5101eeb6..460496d1 100644
--- a/storage/innobase/include/gis0rtree.inl
+++ b/storage/innobase/include/gis0rtree.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -240,6 +240,9 @@ rtr_info_reinit_in_cursor(
bool need_prdt) /*!< in: Whether predicate lock is
needed */
{
+ que_thr_t* thr = cursor->rtr_info->thr;
+ ut_ad(thr);
rtr_clean_rtr_info(cursor->rtr_info, false);
rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true);
+ cursor->rtr_info->thr = thr;
}
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index c246b2ef..d1ff331f 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -1,7 +1,6 @@
/*****************************************************************************
-Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -17,420 +16,40 @@ this program; if not, write to the Free Software Foundation, Inc.,
*****************************************************************************/
-/**************************************************//**
-@file include/ibuf0ibuf.h
-Insert buffer
-
-Created 7/19/1997 Heikki Tuuri
-*******************************************************/
-
-#ifndef ibuf0ibuf_h
-#define ibuf0ibuf_h
-
-#include "mtr0mtr.h"
-#include "dict0mem.h"
-#include "fsp0fsp.h"
-
-/** Default value for maximum on-disk size of change buffer in terms
-of percentage of the buffer pool. */
-#define CHANGE_BUFFER_DEFAULT_SIZE (25)
-
-/* Possible operations buffered in the insert/whatever buffer. See
-ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
-typedef enum {
- IBUF_OP_INSERT = 0,
- IBUF_OP_DELETE_MARK = 1,
- IBUF_OP_DELETE = 2,
-
- /* Number of different operation types. */
- IBUF_OP_COUNT = 3
-} ibuf_op_t;
-
-/** Combinations of operations that can be buffered.
-@see innodb_change_buffering_names */
-enum ibuf_use_t {
- IBUF_USE_NONE = 0,
- IBUF_USE_INSERT, /* insert */
- IBUF_USE_DELETE_MARK, /* delete */
- IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */
- IBUF_USE_DELETE, /* delete+purge */
- IBUF_USE_ALL /* insert+delete+purge */
-};
-
-/** Operations that can currently be buffered. */
-extern ulong innodb_change_buffering;
-
-/** Insert buffer struct */
-struct ibuf_t{
- Atomic_relaxed<ulint> size; /*!< current size of the ibuf index
- tree, in pages */
- Atomic_relaxed<ulint> max_size; /*!< recommended maximum size of the
- ibuf index tree, in pages */
- ulint seg_size; /*!< allocated pages of the file
- segment containing ibuf header and
- tree */
- bool empty; /*!< Protected by the page
- latch of the root page of the
- insert buffer tree
- (FSP_IBUF_TREE_ROOT_PAGE_NO). true
- if and only if the insert
- buffer tree is empty. */
- ulint free_list_len; /*!< length of the free list */
- ulint height; /*!< tree height */
- dict_index_t* index; /*!< insert buffer index */
-
- /** number of pages merged */
- Atomic_counter<ulint> n_merges;
- Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT];
- /*!< number of operations of each type
- merged to index pages */
- Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT];
- /*!< number of operations of each type
- discarded without merging due to the
- tablespace being deleted or the
- index being dropped */
-};
-
-/** The insert buffer control structure */
-extern ibuf_t ibuf;
-
-/* The purpose of the insert buffer is to reduce random disk access.
-When we wish to insert a record into a non-unique secondary index and
-the B-tree leaf page where the record belongs to is not in the buffer
-pool, we insert the record into the insert buffer B-tree, indexed by
-(space_id, page_no). When the page is eventually read into the buffer
-pool, we look up the insert buffer B-tree for any modifications to the
-page, and apply these upon the completion of the read operation. This
-is called the insert buffer merge. */
-
-/* The insert buffer merge must always succeed. To guarantee this,
-the insert buffer subsystem keeps track of the free space in pages for
-which it can buffer operations. Two bits per page in the insert
-buffer bitmap indicate the available space in coarse increments. The
-free bits in the insert buffer bitmap must never exceed the free space
-on a page. It is safe to decrement or reset the bits in the bitmap in
-a mini-transaction that is committed before the mini-transaction that
-affects the free space. It is unsafe to increment the bits in a
-separately committed mini-transaction, because in crash recovery, the
-free bits could momentarily be set too high. */
-
-/******************************************************************//**
-Creates the insert buffer data structure at a database startup.
-@return DB_SUCCESS or failure */
-dberr_t
-ibuf_init_at_db_start(void);
-/*=======================*/
-/*********************************************************************//**
-Updates the max_size value for ibuf. */
-void
-ibuf_max_size_update(
-/*=================*/
- ulint new_val); /*!< in: new value in terms of
- percentage of the buffer pool size */
-/*********************************************************************//**
-Reads the biggest tablespace id from the high end of the insert buffer
-tree and updates the counter in fil_system. */
-void
-ibuf_update_max_tablespace_id(void);
-/*===============================*/
-/***************************************************************//**
-Starts an insert buffer mini-transaction. */
-UNIV_INLINE
-void
-ibuf_mtr_start(
-/*===========*/
- mtr_t* mtr) /*!< out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
-/***************************************************************//**
-Commits an insert buffer mini-transaction. */
-UNIV_INLINE
-void
-ibuf_mtr_commit(
-/*============*/
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
-/************************************************************************//**
-Resets the free bits of the page in the ibuf bitmap. This is done in a
-separate mini-transaction, hence this operation does not restrict
-further work to only ibuf bitmap operations, which would result if the
-latch to the bitmap page were kept. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to decrement or reset the bits in the bitmap in a mini-transaction
-that is committed before the mini-transaction that affects the free
-space. */
-void
-ibuf_reset_free_bits(
-/*=================*/
- buf_block_t* block); /*!< in: index page; free bits are set to 0
- if the index is a non-clustered
- non-unique, and page level is 0 */
-/************************************************************************//**
-Updates the free bits of an uncompressed page in the ibuf bitmap if
-there is not enough free on the page any more. This is done in a
-separate mini-transaction, hence this operation does not restrict
-further work to only ibuf bitmap operations, which would result if the
-latch to the bitmap page were kept. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is
-unsafe to increment the bits in a separately committed
-mini-transaction, because in crash recovery, the free bits could
-momentarily be set too high. It is only safe to use this function for
-decrementing the free bits. Should more free space become available,
-we must not update the free bits here, because that would break crash
-recovery. */
-UNIV_INLINE
-void
-ibuf_update_free_bits_if_full(
-/*==========================*/
- buf_block_t* block, /*!< in: index page to which we have added new
- records; the free bits are updated if the
- index is non-clustered and non-unique and
- the page level is 0, and the page becomes
- fuller */
- ulint max_ins_size,/*!< in: value of maximum insert size with
- reorganize before the latest operation
- performed to the page */
- ulint increase);/*!< in: upper limit for the additional space
- used in the latest operation, if known, or
- ULINT_UNDEFINED */
-/**********************************************************************//**
-Updates the free bits for an uncompressed page to reflect the present
-state. Does this in the mtr given, which means that the latching
-order rules virtually prevent any further operations for this OS
-thread until mtr is committed. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to set the free bits in the same mini-transaction that updated the
-page. */
-void
-ibuf_update_free_bits_low(
-/*======================*/
- const buf_block_t* block, /*!< in: index page */
- ulint max_ins_size, /*!< in: value of
- maximum insert size
- with reorganize before
- the latest operation
- performed to the page */
- mtr_t* mtr); /*!< in/out: mtr */
-/**********************************************************************//**
-Updates the free bits for a compressed page to reflect the present
-state. Does this in the mtr given, which means that the latching
-order rules virtually prevent any further operations for this OS
-thread until mtr is committed. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is safe
-to set the free bits in the same mini-transaction that updated the
-page. */
-void
-ibuf_update_free_bits_zip(
-/*======================*/
- buf_block_t* block, /*!< in/out: index page */
- mtr_t* mtr); /*!< in/out: mtr */
-/**********************************************************************//**
-Updates the free bits for the two pages to reflect the present state.
-Does this in the mtr given, which means that the latching order rules
-virtually prevent any further operations until mtr is committed.
-NOTE: The free bits in the insert buffer bitmap must never exceed the
-free space on a page. It is safe to set the free bits in the same
-mini-transaction that updated the pages. */
-void
-ibuf_update_free_bits_for_two_pages_low(
-/*====================================*/
- buf_block_t* block1, /*!< in: index page */
- buf_block_t* block2, /*!< in: index page */
- mtr_t* mtr); /*!< in: mtr */
-/**********************************************************************//**
-A basic partial test if an insert to the insert buffer could be possible and
-recommended. */
-UNIV_INLINE
-ibool
-ibuf_should_try(
-/*============*/
- dict_index_t* index, /*!< in: index where to insert */
- ulint ignore_sec_unique); /*!< in: if != 0, we should
- ignore UNIQUE constraint on
- a secondary index when we
- decide */
-/******************************************************************//**
-Returns TRUE if the current OS thread is performing an insert buffer
-routine.
-
-For instance, a read-ahead of non-ibuf pages is forbidden by threads
-that are executing an insert buffer routine.
-@return TRUE if inside an insert buffer routine */
-UNIV_INLINE
-ibool
-ibuf_inside(
-/*========*/
- const mtr_t* mtr) /*!< in: mini-transaction */
- MY_ATTRIBUTE((warn_unused_result));
-
-/** Checks if a page address is an ibuf bitmap page (level 3 page) address.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return TRUE if a bitmap page */
-inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size)
-{
- ut_ad(ut_is_2pow(zip_size));
- ulint size = zip_size ? zip_size : srv_page_size;
- return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET;
-}
-
-/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
-Must not be called when recv_no_ibuf_operations==true.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] x_latch FALSE if relaxed check (avoid latching the
-bitmap page)
-@param[in,out] mtr mtr which will contain an x-latch to the
-bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
-in which case a new transaction is created.
-@return true if level 2 or level 3 page */
-bool
-ibuf_page_low(
- const page_id_t page_id,
- ulint zip_size,
-#ifdef UNIV_DEBUG
- bool x_latch,
-#endif /* UNIV_DEBUG */
- mtr_t* mtr)
- MY_ATTRIBUTE((warn_unused_result));
-
-#ifdef UNIV_DEBUG
-/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
-Must not be called when recv_no_ibuf_operations==true.
-@param[in] page_id tablespace/page identifier
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction or NULL
-@return TRUE if level 2 or level 3 page */
-# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, true, mtr)
-
-#else /* UNIV_DEBUG */
-
-/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
-Must not be called when recv_no_ibuf_operations==true.
-@param[in] page_id tablespace/page identifier
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction or NULL
-@return TRUE if level 2 or level 3 page */
-# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, mtr)
-
-#endif /* UNIV_DEBUG */
-/***********************************************************************//**
-Frees excess pages from the ibuf free list. This function is called when an OS
-thread calls fsp services to allocate a new file segment, or a new page to a
-file segment, and the thread did not own the fsp latch before this call. */
-void
-ibuf_free_excess_pages(void);
-/*========================*/
-
-/** Buffer an operation in the change buffer, instead of applying it
-directly to the file page, if this is possible. Does not do it if the index
-is clustered or unique.
-@param[in] op operation type
-@param[in] entry index entry to insert
-@param[in,out] index index where to insert
-@param[in] page_id page id where to insert
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] thr query thread
-@return true if success */
-bool
-ibuf_insert(
- ibuf_op_t op,
- const dtuple_t* entry,
- dict_index_t* index,
- const page_id_t page_id,
- ulint zip_size,
- que_thr_t* thr);
-
-/** Check whether buffered changes exist for a page.
-@param[in] id page identifier
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return whether buffered changes exist */
-bool ibuf_page_exists(const page_id_t id, ulint zip_size);
-
-/** When an index page is read from a disk to the buffer pool, this function
-applies any buffered operations to the page and deletes the entries from the
-insert buffer. If the page is not read, but created in the buffer pool, this
-function deletes its buffered entries from the insert buffer; there can
-exist entries for such a page if the page belonged to an index which
-subsequently was dropped.
-@param block X-latched page to try to apply changes to, or NULL to discard
-@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@return error code */
-dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
- const page_id_t page_id,
- ulint zip_size);
-
-/** Delete all change buffer entries for a tablespace,
-in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
-@param[in] space missing or to-be-discarded tablespace */
-void ibuf_delete_for_discarded_space(uint32_t space);
-
-/** Contract the change buffer by reading pages to the buffer pool.
-@return a lower limit for the combined size in bytes of entries which
-will be merged from ibuf trees to the pages read
-@retval 0 if ibuf.empty */
-ulint ibuf_contract();
-
-/** Contracts insert buffer trees by reading pages referring to space_id
-to the buffer pool.
-@returns number of pages merged.*/
-ulint
-ibuf_merge_space(
-/*=============*/
- ulint space); /*!< in: space id */
-
-/******************************************************************//**
-Looks if the insert buffer is empty.
-@return true if empty */
-bool
-ibuf_is_empty(void);
-/*===============*/
-/******************************************************************//**
-Prints info of ibuf. */
-void
-ibuf_print(
-/*=======*/
- FILE* file); /*!< in: file where to print */
-/********************************************************************
-Read the first two bytes from a record's fourth field (counter field in new
-records; something else in older records).
-@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */
-ulint
-ibuf_rec_get_counter(
-/*=================*/
- const rec_t* rec); /*!< in: ibuf record */
-/******************************************************************//**
-Closes insert buffer and frees the data structures. */
-void
-ibuf_close(void);
-/*============*/
-
-/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
-@param[in] trx transaction
-@param[in,out] space tablespace being imported
-@return DB_SUCCESS or error code */
-dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/** Update free bits and buffered bits for bulk loaded page.
-@param block secondary index leaf page
-@param mtr mini-transaction
-@param reset whether the page is full */
-void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset);
-
-#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
-#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
-
-/* The ibuf header page currently contains only the file segment header
-for the file segment from which the pages for the ibuf tree are allocated */
-#define IBUF_HEADER PAGE_DATA
-#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
-
-/* The insert buffer tree itself is always located in space 0. */
-#define IBUF_SPACE_ID static_cast<ulint>(0)
-
-#include "ibuf0ibuf.inl"
-
-#endif
+#include "db0err.h"
+
+/* The purpose of the change buffer was to reduce random disk access.
+When we wished to
+(1) insert a record into a non-unique secondary index,
+(2) delete-mark a secondary index record,
+(3) delete a secondary index record as part of purge (but not ROLLBACK),
+and the B-tree leaf page to which the record belonged was not in the buffer
+pool, we inserted a record into the change buffer B-tree, indexed by
+the page identifier. When the page was eventually read into the buffer
+pool, we looked up the change buffer B-tree for any modifications to the
+page and applied them upon the completion of the read operation. This
+was called the insert buffer merge.
+
+There was a hash index of the change buffer B-tree, implemented as the
+"change buffer bitmap". Bits in these bitmap pages indicated how full
+the page roughly was, and whether any records for the page identifier
+exist in the change buffer. The "free" bits had to be updated as part of
+operations that modified secondary index leaf pages.
+
+Because the change buffer has been removed, we will no longer update
+any change buffer bitmap pages. Instead, on database startup, we will
+check if an upgrade needs to be performed, and apply any buffered
+changes if that is the case. Finally, the change buffer will be
+transformed to a format that will not be recognized by earlier
+versions of MariaDB Server, to prevent downgrades from causing
+corruption (due to the removed updates of the bitmap pages) when the
+change buffer might be enabled. */
+
+/** Check if ibuf_upgrade() is needed as part of server startup.
+@return error code
+@retval DB_SUCCESS if no upgrade is needed
+@retval DB_FAIL if the change buffer is not empty (need ibuf_upgrade()) */
+dberr_t ibuf_upgrade_needed();
+
+/** Upgrade the change buffer after all redo log has been applied. */
+dberr_t ibuf_upgrade();
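
A hedged sketch of how the two remaining entry points could be driven at
startup; only ibuf_upgrade_needed() and ibuf_upgrade() come from the
declarations above, the wrapper name and its exact place in the startup
sequence are assumptions.

    // sketch only: to be called once all redo log has been applied
    static dberr_t srv_change_buffer_upgrade()   // hypothetical wrapper
    {
      dberr_t err= ibuf_upgrade_needed();
      if (err == DB_SUCCESS)
        return DB_SUCCESS;   /* the change buffer is already empty */
      if (err != DB_FAIL)
        return err;          /* the change buffer could not be read */
      /* buffered changes exist: merge them, then convert the change buffer
         to a format that older servers will refuse to use */
      return ibuf_upgrade();
    }
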
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
deleted file mode 100644
index 003bf22a..00000000
--- a/storage/innobase/include/ibuf0ibuf.inl
+++ /dev/null
@@ -1,282 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2021, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
-
-*****************************************************************************/
-
-/**************************************************//**
-@file include/ibuf0ibuf.ic
-Insert buffer
-
-Created 7/19/1997 Heikki Tuuri
-*******************************************************/
-
-#include "page0page.h"
-#include "page0zip.h"
-#include "fsp0types.h"
-#include "buf0lru.h"
-
-/** An index page must contain at least srv_page_size /
-IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
-buffer inserts to this page. If there is this much of free space, the
-corresponding bits are set in the ibuf bitmap. */
-#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
-
-/***************************************************************//**
-Starts an insert buffer mini-transaction. */
-UNIV_INLINE
-void
-ibuf_mtr_start(
-/*===========*/
- mtr_t* mtr) /*!< out: mini-transaction */
-{
- mtr_start(mtr);
- mtr->enter_ibuf();
-
- if (high_level_read_only || srv_read_only_mode) {
- mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
- }
-
-}
-/***************************************************************//**
-Commits an insert buffer mini-transaction. */
-UNIV_INLINE
-void
-ibuf_mtr_commit(
-/*============*/
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ut_ad(mtr->is_inside_ibuf());
- ut_d(mtr->exit_ibuf());
-
- mtr_commit(mtr);
-}
-
-/************************************************************************//**
-Sets the free bit of the page in the ibuf bitmap. This is done in a separate
-mini-transaction, hence this operation does not restrict further work to only
-ibuf bitmap operations, which would result if the latch to the bitmap page
-were kept. */
-void
-ibuf_set_free_bits_func(
-/*====================*/
- buf_block_t* block, /*!< in: index page of a non-clustered index;
- free bit is reset if page level is 0 */
-#ifdef UNIV_IBUF_DEBUG
- ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
- value which the bits must have before
- setting; this is for debugging */
-#endif /* UNIV_IBUF_DEBUG */
- ulint val); /*!< in: value to set: < 4 */
-#ifdef UNIV_IBUF_DEBUG
-# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
-#else /* UNIV_IBUF_DEBUG */
-# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
-#endif /* UNIV_IBUF_DEBUG */
-
-/**********************************************************************//**
-A basic partial test if an insert to the insert buffer could be possible and
-recommended. */
-UNIV_INLINE
-ibool
-ibuf_should_try(
-/*============*/
- dict_index_t* index, /*!< in: index where to insert */
- ulint ignore_sec_unique) /*!< in: if != 0, we should
- ignore UNIQUE constraint on
- a secondary index when we
- decide */
-{
- if (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL) ||
- !innodb_change_buffering || !ibuf.max_size)
- return false;
- if (!ignore_sec_unique && index->is_unique())
- return false;
- if (index->table->quiesce != QUIESCE_NONE)
- return false;
- for (unsigned i= 0; i < index->n_fields; i++)
- if (index->fields[i].descending)
- return false;
- return true;
-}
-
-/******************************************************************//**
-Returns TRUE if the current OS thread is performing an insert buffer
-routine.
-
-For instance, a read-ahead of non-ibuf pages is forbidden by threads
-that are executing an insert buffer routine.
-@return TRUE if inside an insert buffer routine */
-UNIV_INLINE
-ibool
-ibuf_inside(
-/*========*/
- const mtr_t* mtr) /*!< in: mini-transaction */
-{
- return(mtr->is_inside_ibuf());
-}
-
-/** Translates the free space on a page to a value in the ibuf bitmap.
-@param[in] page_size page size in bytes
-@param[in] max_ins_size maximum insert size after reorganize for
-the page
-@return value for ibuf bitmap bits */
-UNIV_INLINE
-ulint
-ibuf_index_page_calc_free_bits(
- ulint page_size,
- ulint max_ins_size)
-{
- ulint n;
- ut_ad(ut_is_2pow(page_size));
- ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
-
- n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
-
- if (n == 3) {
- n = 2;
- }
-
- if (n > 3) {
- n = 3;
- }
-
- return(n);
-}
-
-/*********************************************************************//**
-Translates the free space on a compressed page to a value in the ibuf bitmap.
-@return value for ibuf bitmap bits */
-UNIV_INLINE
-ulint
-ibuf_index_page_calc_free_zip(
-/*==========================*/
- const buf_block_t* block) /*!< in: buffer block */
-{
- ulint max_ins_size;
- const page_zip_des_t* page_zip;
- lint zip_max_ins;
-
- ut_ad(block->page.zip.data);
-
- /* Consider the maximum insert size on the uncompressed page
- without reorganizing the page. We must not assume anything
- about the compression ratio. If zip_max_ins > max_ins_size and
- there is 1/4 garbage on the page, recompression after the
- reorganize could fail, in theory. So, let us guarantee that
- merging a buffered insert to a compressed page will always
- succeed without reorganizing or recompressing the page, just
- by using the page modification log. */
- max_ins_size = page_get_max_insert_size(
- buf_block_get_frame(block), 1);
-
- page_zip = buf_block_get_page_zip(block);
- zip_max_ins = page_zip_max_ins_size(page_zip,
- FALSE/* not clustered */);
-
- if (zip_max_ins < 0) {
- return(0);
- } else if (max_ins_size > (ulint) zip_max_ins) {
- max_ins_size = (ulint) zip_max_ins;
- }
-
- return(ibuf_index_page_calc_free_bits(block->physical_size(),
- max_ins_size));
-}
-
-/*********************************************************************//**
-Translates the free space on a page to a value in the ibuf bitmap.
-@return value for ibuf bitmap bits */
-UNIV_INLINE
-ulint
-ibuf_index_page_calc_free(
-/*======================*/
- const buf_block_t* block) /*!< in: buffer block */
-{
- if (!block->page.zip.data) {
- ulint max_ins_size;
-
- max_ins_size = page_get_max_insert_size_after_reorganize(
- buf_block_get_frame(block), 1);
-
- return(ibuf_index_page_calc_free_bits(
- block->physical_size(), max_ins_size));
- } else {
- return(ibuf_index_page_calc_free_zip(block));
- }
-}
-
-/************************************************************************//**
-Updates the free bits of an uncompressed page in the ibuf bitmap if
-there is not enough free on the page any more. This is done in a
-separate mini-transaction, hence this operation does not restrict
-further work to only ibuf bitmap operations, which would result if the
-latch to the bitmap page were kept. NOTE: The free bits in the insert
-buffer bitmap must never exceed the free space on a page. It is
-unsafe to increment the bits in a separately committed
-mini-transaction, because in crash recovery, the free bits could
-momentarily be set too high. It is only safe to use this function for
-decrementing the free bits. Should more free space become available,
-we must not update the free bits here, because that would break crash
-recovery. */
-UNIV_INLINE
-void
-ibuf_update_free_bits_if_full(
-/*==========================*/
- buf_block_t* block, /*!< in: index page to which we have added new
- records; the free bits are updated if the
- index is non-clustered and non-unique and
- the page level is 0, and the page becomes
- fuller */
- ulint max_ins_size,/*!< in: value of maximum insert size with
- reorganize before the latest operation
- performed to the page */
- ulint increase)/*!< in: upper limit for the additional space
- used in the latest operation, if known, or
- ULINT_UNDEFINED */
-{
- ulint before;
- ulint after;
-
- ut_ad(buf_block_get_page_zip(block) == NULL);
-
- before = ibuf_index_page_calc_free_bits(
- srv_page_size, max_ins_size);
-
- if (max_ins_size >= increase) {
- compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX);
- after = ibuf_index_page_calc_free_bits(
- srv_page_size, max_ins_size - increase);
-#ifdef UNIV_IBUF_DEBUG
- ut_a(after <= ibuf_index_page_calc_free(block));
-#endif
- } else {
- after = ibuf_index_page_calc_free(block);
- }
-
- if (after == 0) {
- /* We move the page to the front of the buffer pool LRU list:
- the purpose of this is to prevent those pages to which we
- cannot make inserts using the insert buffer from slipping
- out of the buffer pool */
-
- buf_page_make_young(&block->page);
- }
-
- if (before > after) {
- ibuf_set_free_bits(block, after, before);
- }
-}
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 08b9f4bc..cab44dd9 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -185,13 +185,6 @@ lock_update_split_left(
void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
const page_id_t right);
-/** Update the locks when a page is split and merged to two pages,
-in defragmentation. */
-void lock_update_split_and_merge(
- const buf_block_t* left_block, /*!< in: left page to which merged */
- const rec_t* orig_pred, /*!< in: original predecessor of
- supremum on the left page before merge*/
- const buf_block_t* right_block);/*!< in: right page from which merged */
/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
index 2500ac05..1b8c4b41 100644
--- a/storage/innobase/include/log0crypt.h
+++ b/storage/innobase/include/log0crypt.h
@@ -87,7 +87,7 @@ void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end);
/** Encrypt or decrypt a temporary file block.
@param[in] src block to encrypt or decrypt
-@param[in] size size of the block
+@param[in] size length of both src and dst in bytes
@param[out] dst destination block
@param[in] offs offset to block
@param[in] encrypt true=encrypt; false=decrypt
@@ -102,7 +102,7 @@ bool log_tmp_block_encrypt(
/** Decrypt a temporary file block.
@param[in] src block to decrypt
-@param[in] size size of the block
+@param[in] size length of both src and dst in bytes
@param[out] dst destination block
@param[in] offs offset to block
@return whether the operation succeeded */
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index cef0dcae..85d01f2f 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -291,6 +291,8 @@ public:
bool log_maybe_unbuffered;
# endif
#endif
+ /** whether each write to ib_logfile0 is durable (O_DSYNC) */
+ my_bool log_write_through;
/** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if
@@ -407,6 +409,8 @@ public:
/** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered);
#endif
+ /** Try to enable or disable durable writes (update log_write_through) */
+ void set_write_through(bool write_through);
void close_file();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index a73b7279..9321a8b8 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -203,22 +203,9 @@ struct page_recv_t
inline void will_not_read();
};
-/** A page initialization operation that was parsed from the redo log */
-struct recv_init
-{
- /** log sequence number of the page initialization */
- lsn_t lsn;
- /** Whether btr_page_create() avoided a read of the page.
- At the end of the last recovery batch, mark_ibuf_exist()
- will mark pages for which this flag is set. */
- bool created;
-};
-
/** Recovery system data structure */
struct recv_sys_t
{
- using init= recv_init;
-
/** mutex protecting this as well as some of page_recv_t */
alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
private:
@@ -273,7 +260,10 @@ private:
lsn_t lsn;
/** truncated size of the tablespace, or 0 if not truncated */
unsigned pages;
- } truncated_undo_spaces[127];
+ };
+
+ trunc truncated_undo_spaces[127];
+ trunc truncated_sys_space;
public:
/** The contents of the doublewrite buffer */
@@ -299,23 +289,23 @@ public:
pages_it= pages.end();
}
+  /** Allow applying a system tablespace truncate redo log record
+  only if the recorded size is smaller than the current size.
+  @retval true to apply the truncate (shrink) redo log record
+  @retval false otherwise */
+ bool check_sys_truncate();
+
private:
/** Attempt to initialize a page based on redo log records.
@param p iterator
@param mtr mini-transaction
@param b pre-allocated buffer pool block
- @param init page initialization
+  @param init_lsn log sequence number of the page initialization
@return the recovered block
@retval nullptr if the page cannot be initialized based on log records
@retval -1 if the page cannot be recovered due to corruption */
inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr,
- buf_block_t *b, init &init);
- /** Attempt to initialize a page based on redo log records.
- @param page_id page identifier
- @return the recovered block
- @retval nullptr if the page cannot be initialized based on log records
- @retval -1 if the page cannot be recovered due to corruption */
- ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id);
+ buf_block_t *b, lsn_t init_lsn);
/** All found log files (multiple ones are possible if we are upgrading
from before MariaDB Server 10.5.1) */
@@ -460,15 +450,14 @@ public:
/** @return whether log file corruption was found */
bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
- /** Attempt to initialize a page based on redo log records.
+ /** Read a page or recover it based on redo log records.
@param page_id page identifier
- @return the recovered block
- @retval nullptr if the page cannot be initialized based on log records
- @retval -1 if the page cannot be recovered due to corruption */
- buf_block_t *recover(const page_id_t page_id)
- {
- return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
- }
+ @param mtr mini-transaction
+ @param err error code
+ @return the requested block
+ @retval nullptr if the page cannot be accessed due to corruption */
+ ATTRIBUTE_COLD
+ buf_block_t *recover(const page_id_t page_id, mtr_t *mtr, dberr_t *err);
/** Try to recover a tablespace that was not readable earlier
@param p iterator
@@ -484,16 +473,6 @@ public:
/** The recovery system */
extern recv_sys_t recv_sys;
-/** If the following is TRUE, the buffer pool file pages must be invalidated
-after recovery and no ibuf operations are allowed; this will be set if
-recv_sys.pages becomes too full, and log records must be merged
-to file pages already before the recovery is finished: in this case no
-ibuf operations are allowed, as they could modify the pages read in the
-buffer pool before the pages have been recovered to the up-to-date state.
-
-TRUE means that recovery is running and no operations on the log files
-are allowed yet: the variable name is misleading. */
-extern bool recv_no_ibuf_operations;
/** TRUE when recv_init_crash_recovery() has been called. */
extern bool recv_needed_recovery;
#ifdef UNIV_DEBUG
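
The recover() interface above now takes the caller's mini-transaction and
reports failure through an error code rather than a -1 sentinel. A hedged
usage sketch, assuming the caller has already started the mini-transaction;
only the signature is taken from the header.

    // sketch: look up a page during recovery, applying any pending redo records
    static buf_block_t *read_or_recover(const page_id_t id, mtr_t &mtr)
    {
      dberr_t err= DB_SUCCESS;
      buf_block_t *block= recv_sys.recover(id, &mtr, &err);
      // block == nullptr means the page is inaccessible; err then carries
      // the reason, for example DB_CORRUPTION
      return block;
    }
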
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index bfa66216..27811872 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -309,15 +309,6 @@ public:
@retval 0 if the transaction only modified temporary tablespaces */
lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
- /** Note that we are inside the change buffer code. */
- void enter_ibuf() { m_inside_ibuf= true; }
-
- /** Note that we have exited from the change buffer code. */
- void exit_ibuf() { m_inside_ibuf= false; }
-
- /** @return true if we are inside the change buffer code */
- bool is_inside_ibuf() const { return m_inside_ibuf; }
-
/** Note that some pages have been freed */
void set_trim_pages() { m_trim_pages= true; }
@@ -772,10 +763,6 @@ private:
/** whether log_sys.latch is locked exclusively */
uint16_t m_latch_ex:1;
- /** whether change buffer is latched; only needed in non-debug builds
- to suppress some read-ahead operations, @see ibuf_inside() */
- uint16_t m_inside_ibuf:1;
-
/** whether the pages has been trimmed */
uint16_t m_trim_pages:1;
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index 7eba359f..ce686475 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -197,10 +197,14 @@ public:
WRITE_SYNC= 16,
/** Asynchronous write */
WRITE_ASYNC= WRITE_SYNC | 1,
+ /** Asynchronous doublewritten page */
+ WRITE_DBL= WRITE_ASYNC | 4,
/** A doublewrite batch */
DBLWR_BATCH= WRITE_ASYNC | 8,
/** Write data and punch hole for the rest */
PUNCH= WRITE_ASYNC | 16,
+ /** Write doublewritten data and punch hole for the rest */
+ PUNCH_DBL= PUNCH | 4,
/** Zero out a range of bytes in fil_space_t::io() */
PUNCH_RANGE= WRITE_SYNC | 32,
};
@@ -216,6 +220,14 @@ public:
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+ bool is_doublewritten() const { return (type & 4) != 0; }
+
+ /** Create a write request for the doublewrite buffer. */
+ IORequest doublewritten() const
+ {
+ ut_ad(type == WRITE_ASYNC || type == PUNCH);
+ return IORequest{bpage, slot, node, Type(type | 4)};
+ }
void write_complete(int io_error) const;
void read_complete(int io_error) const;
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
index 28aa3056..279138ac 100644
--- a/storage/innobase/include/page0cur.h
+++ b/storage/innobase/include/page0cur.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2022, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -117,11 +117,6 @@ succeed, i.e., enough space available, NULL otherwise. The cursor stays at
the same logical position, but the physical position may change if it is
pointing to a compressed page that was reorganized.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to record if succeed, NULL otherwise */
UNIV_INLINE
rec_t*
@@ -151,11 +146,6 @@ page_cur_insert_rec_low(
Inserts a record next to page cursor on a compressed and uncompressed
page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to inserted record
@return nullptr on failure */
rec_t*
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
index 7c4eafa2..a73c31a7 100644
--- a/storage/innobase/include/page0cur.inl
+++ b/storage/innobase/include/page0cur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -155,11 +155,6 @@ succeed, i.e., enough space available, NULL otherwise. The cursor stays at
the same logical position, but the physical position may change if it is
pointing to a compressed page that was reorganized.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to record if succeed, NULL otherwise */
UNIV_INLINE
rec_t*
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index 2978656b..38373f6b 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -95,7 +95,7 @@ this byte can be garbage. */
direction */
#define PAGE_N_RECS 16 /* number of user records on the page */
/** The largest DB_TRX_ID that may have modified a record on the page;
-Defined only in secondary index leaf pages and in change buffer leaf pages.
+Defined only in secondary index leaf pages.
Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */
#define PAGE_MAX_TRX_ID 18
/** The AUTO_INCREMENT value (on persistent clustered index root pages). */
@@ -901,11 +901,6 @@ MY_ATTRIBUTE((nonnull, warn_unused_result))
Differs from page_copy_rec_list_end, because this function does not
touch the lock table and max trx id on page or compress the page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-
@return error code */
dberr_t
page_copy_rec_list_end_no_locks(
@@ -920,11 +915,6 @@ Copies records from page to new_page, from the given record onward,
including that record. Infimum and supremum records are not copied.
The records are copied to the start of the record list on new_page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-
@return pointer to the original successor of the infimum record on new_block
@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
@@ -942,11 +932,6 @@ Copies records from page to new_page, up to the given record, NOT
including that record. Infimum and supremum records are not copied.
The records are copied to the end of the record list on new_page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to the original predecessor of the supremum record on new_block
@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 43329906..501ef31a 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -2,7 +2,7 @@
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -213,9 +213,9 @@ page_zip_max_ins_size(
/**********************************************************************//**
Determine if enough space is available in the modification log.
-@return TRUE if page_zip_write_rec() will succeed */
+@return true if page_zip_write_rec() will succeed */
UNIV_INLINE
-ibool
+bool
page_zip_available(
/*===============*/
const page_zip_des_t* page_zip,/*!< in: compressed page */
@@ -323,10 +323,6 @@ Reorganize and compress a page. This is a low-level operation for
compressed pages, to be used when page_zip_compress() fails.
On success, redo log will be written.
The function btr_page_reorganize() should be preferred whenever possible.
-IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
-non-clustered index, the caller must update the insert buffer free
-bits in the same mini-transaction in such a way that the modification
-will be redo-logged.
@return error code
@retval DB_FAIL on overflow; the block_zip will be left intact */
dberr_t
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
index afc877c3..edcd4ab4 100644
--- a/storage/innobase/include/page0zip.inl
+++ b/storage/innobase/include/page0zip.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -269,7 +269,7 @@ page_zip_max_ins_size(
Determine if enough space is available in the modification log.
@return TRUE if enough space is available */
UNIV_INLINE
-ibool
+bool
page_zip_available(
/*===============*/
const page_zip_des_t* page_zip,/*!< in: compressed page */
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
index 46c209cb..da7337a3 100644
--- a/storage/innobase/include/rem0rec.inl
+++ b/storage/innobase/include/rem0rec.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1095,9 +1095,7 @@ rec_get_converted_size(
ut_ad(dtuple_check_typed(dtuple));
#ifdef UNIV_DEBUG
- if (dict_index_is_ibuf(index)) {
- ut_ad(dtuple->n_fields > 1);
- } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
== REC_STATUS_NODE_PTR) {
ut_ad(dtuple->n_fields - 1
== dict_index_get_n_unique_in_tree_nonleaf(index));
diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h
index fd2651da..33e0da0f 100644
--- a/storage/innobase/include/row0import.h
+++ b/storage/innobase/include/row0import.h
@@ -33,6 +33,7 @@ Created 2012-02-08 by Sunny Bains
struct trx_t;
struct dict_table_t;
struct row_prebuilt_t;
+struct HA_CREATE_INFO;
/*****************************************************************//**
Imports a tablespace. The space id in the .ibd file must match the space id
@@ -64,4 +65,13 @@ dberr_t
row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Prepare the create info to create a new stub table for import.
+@param thd Connection
+@param name Table name, format: "db/table_name".
+@param create_info The create info for creating a stub.
+@return ER_ error code
+@retval 0 on success */
+int prepare_create_stub_for_import(THD *thd, const char *name,
+ HA_CREATE_INFO& create_info);
+
#endif /* row0import_h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 1daf4d4a..33ac8599 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,39 +37,6 @@ Created 3/14/1997 Heikki Tuuri
#include <unordered_map>
class MDL_ticket;
-/** Determines if it is possible to remove a secondary index entry.
-Removal is possible if the secondary index entry does not refer to any
-not delete marked version of a clustered index record where DB_TRX_ID
-is newer than the purge view.
-
-NOTE: This function should only be called by the purge thread, only
-while holding a latch on the leaf page of the secondary index entry
-(or keeping the buffer pool watch on the page). It is possible that
-this function first returns true and then false, if a user transaction
-inserts a record that the secondary index entry would refer to.
-However, in that case, the user transaction would also re-insert the
-secondary index entry after purge has removed it and released the leaf
-page latch.
-@param[in,out] node row purge node
-@param[in] index secondary index
-@param[in] entry secondary index entry
-@param[in,out] sec_pcur secondary index cursor or NULL
- if it is called for purge buffering
- operation.
-@param[in,out] sec_mtr mini-transaction which holds
- secondary index entry or NULL if it is
- called for purge buffering operation.
-@param[in] is_tree true=pessimistic purge,
- false=optimistic (leaf-page only)
-@return true if the secondary index record can be purged */
-bool
-row_purge_poss_sec(
- purge_node_t* node,
- dict_index_t* index,
- const dtuple_t* entry,
- btr_pcur_t* sec_pcur=NULL,
- mtr_t* sec_mtr=NULL,
- bool is_tree=false);
/***************************************************************
Does the purge operation.
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index 7056c77f..85c18dde 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,7 +28,6 @@ Created 4/20/1996 Heikki Tuuri
#define row0row_h
#include "que0types.h"
-#include "ibuf0ibuf.h"
#include "trx0types.h"
#include "mtr0mtr.h"
#include "rem0types.h"
@@ -344,23 +343,10 @@ row_parse_int(
ulint mtype,
bool unsigned_type);
-/** Result of row_search_index_entry */
-enum row_search_result {
- ROW_FOUND = 0, /*!< the record was found */
- ROW_NOT_FOUND, /*!< record not found */
- ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or
- BTR_DELETE_MARK was specified, the
- secondary index leaf page was not in
- the buffer pool, and the operation was
- enqueued in the insert/delete buffer */
- ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and
- row_purge_poss_sec() failed */
-};
-
/***************************************************************//**
Searches an index record.
-@return whether the record was found or buffered */
-enum row_search_result
+@return whether the record was found */
+bool
row_search_index_entry(
/*===================*/
const dtuple_t* entry, /*!< in: index entry */
@@ -404,22 +390,17 @@ row_raw_format(
in bytes */
MY_ATTRIBUTE((nonnull, warn_unused_result));
+#include "dict0mem.h"
+
/** Prepare to start a mini-transaction to modify an index.
@param[in,out] mtr mini-transaction
-@param[in,out] index possibly secondary index
-@param[in] pessimistic whether this is a pessimistic operation */
-inline
-void
-row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic)
+@param[in,out] index possibly secondary index */
+inline void row_mtr_start(mtr_t* mtr, dict_index_t* index)
{
mtr->start();
switch (index->table->space_id) {
- case IBUF_SPACE_ID:
- if (pessimistic
- && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
- ibuf_free_excess_pages();
- }
+ case 0:
break;
case SRV_TMP_SPACE_ID:
mtr->set_log_mode(MTR_LOG_NO_REDO);
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index 2ed26748..6b9a6f09 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -2,7 +2,7 @@
Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -226,12 +226,8 @@ enum monitor_id_t {
MONITOR_MODULE_BUF_PAGE,
MONITOR_INDEX_LEAF_PAGE_READ,
MONITOR_INDEX_NON_LEAF_PAGE_READ,
- MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
- MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
MONITOR_UNDO_LOG_PAGE_READ,
MONITOR_INODE_PAGE_READ,
- MONITOR_IBUF_FREELIST_PAGE_READ,
- MONITOR_IBUF_BITMAP_PAGE_READ,
MONITOR_SYSTEM_PAGE_READ,
MONITOR_TRX_SYSTEM_PAGE_READ,
MONITOR_FSP_HDR_PAGE_READ,
@@ -242,12 +238,8 @@ enum monitor_id_t {
MONITOR_OTHER_PAGE_READ,
MONITOR_INDEX_LEAF_PAGE_WRITTEN,
MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
- MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
- MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
MONITOR_UNDO_LOG_PAGE_WRITTEN,
MONITOR_INODE_PAGE_WRITTEN,
- MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
- MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
MONITOR_SYSTEM_PAGE_WRITTEN,
MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
MONITOR_FSP_HDR_PAGE_WRITTEN,
@@ -345,17 +337,6 @@ enum monitor_id_t {
MONITOR_MODULE_FIL_SYSTEM,
MONITOR_OVLD_N_FILE_OPENED,
- /* InnoDB Change Buffer related counters */
- MONITOR_MODULE_IBUF_SYSTEM,
- MONITOR_OVLD_IBUF_MERGE_INSERT,
- MONITOR_OVLD_IBUF_MERGE_DELETE,
- MONITOR_OVLD_IBUF_MERGE_PURGE,
- MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
- MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
- MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
- MONITOR_OVLD_IBUF_MERGES,
- MONITOR_OVLD_IBUF_SIZE,
-
/* Counters for server operations */
MONITOR_MODULE_SERVER,
MONITOR_MASTER_THREAD_SLEEP,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 5e6bfc33..df25983a 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -161,9 +161,9 @@ extern char* srv_data_home;
recovery and open all tables in RO mode instead of RW mode. We don't
sync the max trx id to disk either. */
extern my_bool srv_read_only_mode;
-/** Set if InnoDB operates in read-only mode or innodb-force-recovery
-is greater than SRV_FORCE_NO_IBUF_MERGE. */
-extern my_bool high_level_read_only;
+/** Set if innodb_read_only is set or innodb_force_recovery
+is SRV_FORCE_NO_UNDO_LOG_SCAN or greater. */
+extern bool high_level_read_only;
/** store to its own file each table created by an user; data
dictionary tables are in the system tablespace 0 */
extern my_bool srv_file_per_table;
@@ -253,18 +253,6 @@ extern ulong srv_read_ahead_threshold;
extern uint srv_n_read_io_threads;
extern uint srv_n_write_io_threads;
-/* Defragmentation, Origianlly facebook default value is 100, but it's too high */
-#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
-extern my_bool srv_defragment;
-extern uint srv_defragment_n_pages;
-extern uint srv_defragment_stats_accuracy;
-extern uint srv_defragment_fill_factor_n_recs;
-extern double srv_defragment_fill_factor;
-extern uint srv_defragment_frequency;
-extern ulonglong srv_defragment_interval;
-
-extern uint srv_change_buffer_max_size;
-
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@@ -289,7 +277,7 @@ extern ulong srv_flushing_avg_loops;
extern ulong srv_force_recovery;
-/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+/** innodb_fast_shutdown=1 skips purge.
innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
of active transaction (to be done on restart). */
@@ -305,7 +293,6 @@ extern my_bool srv_stats_include_delete_marked;
extern unsigned long long srv_stats_modified_counter;
extern my_bool srv_stats_sample_traditional;
-extern my_bool srv_use_doublewrite_buf;
extern ulong srv_checksum_algorithm;
extern my_bool srv_force_primary_key;
@@ -562,11 +549,6 @@ void srv_monitor_task(void*);
void srv_master_callback(void*);
-/**
-Complete the shutdown tasks such as background DROP TABLE,
-and optionally change buffer merge (on innodb_fast_shutdown=0). */
-void srv_shutdown(bool ibuf_merge);
-
} /* extern "C" */
#ifdef UNIV_DEBUG
@@ -630,14 +612,6 @@ struct export_var_t{
/** Number of undo tablespace truncation operations */
ulong innodb_undo_truncations;
- ulint innodb_defragment_compression_failures; /*!< Number of
- defragment re-compression
- failures */
-
- ulint innodb_defragment_failures; /*!< Number of defragment
- failures*/
- ulint innodb_defragment_count; /*!< Number of defragment
- operations*/
/** Number of instant ALTER TABLE operations that affect columns */
ulong innodb_instant_alter_column;
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
index 2c0167ac..7a7f93b6 100644
--- a/storage/innobase/include/sux_lock.h
+++ b/storage/innobase/include/sux_lock.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2020, 2022, MariaDB Corporation.
+Copyright (c) 2020, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -149,7 +149,7 @@ private:
#endif
public:
- /** In crash recovery or the change buffer, claim the ownership
+ /** In crash recovery, claim the ownership
of the exclusive block lock to the current thread */
void claim_ownership() { set_new_owner(pthread_self()); }
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 3fa41fdf..e6e8eb6b 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -589,8 +589,7 @@ public:
So we take more expensive approach: get trx through current_thd()->ha_data.
Some threads don't have trx attached to THD, and at least server
- initialisation thread, fts_optimize_thread, srv_master_thread,
- dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even
+ initialisation thread doesn't even
have THD at all. For such cases we allocate pins only for duration of
search and free them immediately.
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 15255354..7457addb 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -796,6 +796,7 @@ public:
/** normally set; "SET unique_checks=0, foreign_key_checks=0"
enables bulk insert into an empty table */
unsigned check_unique_secondary:1;
+
/** whether an insert into an empty table is active */
unsigned bulk_insert:1;
/*------------------------------*/
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
index 3d22a33e..2954cf73 100644
--- a/storage/innobase/include/trx0undo.h
+++ b/storage/innobase/include/trx0undo.h
@@ -451,10 +451,10 @@ completely purged and trx_purge_free_segment() has started freeing it */
/** Transaction end identifier (if the log is in a history list),
or 0 if the transaction has not been committed */
#define TRX_UNDO_TRX_NO 8
-/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of
+/* Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of
surviving user records, this used to be called TRX_UNDO_DEL_MARKS.
-This field is redundant; it is only being read by some debug assertions.
+This field was removed in MariaDB 11.0.
The value 1 indicates that purge needs to process the undo log segment.
The value 0 indicates that all of it has been processed, and
@@ -463,7 +463,7 @@ trx_purge_free_segment() has been invoked, so the log is not safe to access.
Before MariaDB 10.3.1, a log segment may carry the value 0 even before
trx_purge_free_segment() was called, for those undo log records for
which purge would not result in removing delete-marked records. */
-#define TRX_UNDO_NEEDS_PURGE 16
+/*#define TRX_UNDO_NEEDS_PURGE 16*/
#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record
of this log on the header page; purge
may remove undo log record from the
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 1b4f70b6..8ef01bc0 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -135,7 +135,6 @@ using the call command. */
assertions. */
#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
#define UNIV_HASH_DEBUG /* debug HASH_ macros */
-#define UNIV_IBUF_DEBUG /* debug the insert buffer */
#define UNIV_PERF_DEBUG /* debug flag that enables
light weight performance
related stuff. */
@@ -468,9 +467,6 @@ extern mysql_pfs_key_t fts_cache_mutex_key;
extern mysql_pfs_key_t fts_cache_init_mutex_key;
extern mysql_pfs_key_t fts_delete_mutex_key;
extern mysql_pfs_key_t fts_doc_id_mutex_key;
-extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
-extern mysql_pfs_key_t ibuf_mutex_key;
-extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
extern mysql_pfs_key_t recalc_pool_mutex_key;
extern mysql_pfs_key_t purge_sys_pq_mutex_key;
extern mysql_pfs_key_t recv_sys_mutex_key;
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index a2107007..27a0d154 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -4423,7 +4423,7 @@ static void lock_rec_unlock_unmodified(hash_cell_t &cell, lock_t *lock,
mtr.start();
if (buf_block_t *block=
btr_block_get(*index, lock->un_member.rec_lock.page_id.page_no(),
- RW_S_LATCH, true, &mtr))
+ RW_S_LATCH, &mtr))
{
if (UNIV_UNLIKELY(!page_is_leaf(block->page.frame)))
{
@@ -6967,45 +6967,3 @@ void lock_sys_t::deadlock_check()
if (acquired)
wr_unlock();
}
-
-/** Update the locks when a page is split and merged to two pages,
-in defragmentation. */
-void lock_update_split_and_merge(
- const buf_block_t* left_block, /*!< in: left page to which merged */
- const rec_t* orig_pred, /*!< in: original predecessor of
- supremum on the left page before merge*/
- const buf_block_t* right_block) /*!< in: right page from which merged */
-{
- ut_ad(page_is_leaf(left_block->page.frame));
- ut_ad(page_is_leaf(right_block->page.frame));
- ut_ad(page_align(orig_pred) == left_block->page.frame);
-
- const page_id_t l{left_block->page.id()};
- const page_id_t r{right_block->page.id()};
- const rec_t *left_next_rec= page_rec_get_next_const(orig_pred);
- if (UNIV_UNLIKELY(!left_next_rec))
- {
- ut_ad("corrupted page" == 0);
- return;
- }
- ut_ad(!page_rec_is_metadata(left_next_rec));
-
- /* This would likely be too large for a memory transaction. */
- LockMultiGuard g{lock_sys.rec_hash, l, r};
-
- /* Inherit the locks on the supremum of the left page to the
- first record which was moved from the right page */
- lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left_block->page.frame,
- page_rec_get_heap_no(left_next_rec),
- PAGE_HEAP_NO_SUPREMUM);
-
- /* Reset the locks on the supremum of the left page,
- releasing waiting transactions */
- lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM);
-
- /* Inherit the locks to the supremum of the left page from the
- successor of the infimum on the right page */
- lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame,
- PAGE_HEAP_NO_SUPREMUM,
- lock_get_min_heap_no(right_block));
-}
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
index 8a771410..fc7a5180 100644
--- a/storage/innobase/log/log0crypt.cc
+++ b/storage/innobase/log/log0crypt.cc
@@ -221,9 +221,9 @@ ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size)
ut_ad(LOG_CRYPT_HDR_SIZE + dst_size
== 512 - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY);
- uint dst_len;
+ uint dst_len = static_cast<uint>(dst_size);
int rc = encryption_crypt(
- buf + LOG_CRYPT_HDR_SIZE, static_cast<uint>(dst_size),
+ buf + LOG_CRYPT_HDR_SIZE, dst_len,
reinterpret_cast<byte*>(dst), &dst_len,
const_cast<byte*>(info.crypt_key),
MY_AES_BLOCK_SIZE,
@@ -332,10 +332,10 @@ ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn)
}
found:
byte dst[512];
- uint dst_len;
byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
const uint src_len = 512 - LOG_BLOCK_HDR_SIZE;
+ uint dst_len = src_len;
ulint log_block_no = log_block_get_hdr_no(buf);
@@ -429,7 +429,7 @@ ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf)
/** Encrypt or decrypt a temporary file block.
@param[in] src block to encrypt or decrypt
-@param[in] size size of the block
+@param[in] size length of both src and dst blocks in bytes
@param[out] dst destination block
@param[in] offs offset to block
@param[in] encrypt true=encrypt; false=decrypt
@@ -441,7 +441,7 @@ bool log_tmp_block_encrypt(
uint64_t offs,
bool encrypt)
{
- uint dst_len;
+ uint dst_len = static_cast<uint>(size);
uint64_t iv[MY_AES_BLOCK_SIZE / sizeof(uint64_t)];
iv[0] = offs;
memcpy(iv + 1, tmp_iv, sizeof iv - sizeof *iv);
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index ea717de2..9d239ce8 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,7 +36,6 @@ Created 12/9/1995 Heikki Tuuri
#include "log0recv.h"
#include "fil0fil.h"
#include "dict0stats_bg.h"
-#include "btr0defragment.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "trx0sys.h"
@@ -436,6 +435,31 @@ void log_t::set_buffered(bool buffered)
}
#endif
+/** Try to enable or disable durable writes (update log_write_through) */
+void log_t::set_write_through(bool write_through)
+{
+ if (is_pmem() || high_level_read_only)
+ return;
+ log_resize_acquire();
+ if (!resize_in_progress() && is_opened() &&
+ bool(log_write_through) != write_through)
+ {
+ os_file_close_func(log.m_file);
+ log.m_file= OS_FILE_CLOSED;
+ std::string path{get_log_file_path()};
+ log_write_through= write_through;
+ bool success;
+ log.m_file= os_file_create_func(path.c_str(),
+ OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
+ false, &success);
+ ut_a(log.m_file != OS_FILE_CLOSED);
+ sql_print_information(log_write_through
+ ? "InnoDB: Log writes write through"
+ : "InnoDB: Log writes may be cached");
+ }
+ log_resize_release();
+}
+
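The function above toggles durability by reopening ib_logfile0 with or without O_DSYNC; the payoff appears later in this file, where log_t::flush() becomes "log_write_through || log.flush()". A minimal sketch of that contract, assuming POSIX semantics; flush_if_needed() is a hypothetical helper, not part of the patch:

#include <fcntl.h>
#include <unistd.h>

// With O_DSYNC, each write is durable on return, so the explicit
// fdatasync() may be skipped; otherwise we still have to flush.
bool flush_if_needed(int fd, bool write_through)
{
  return write_through || fdatasync(fd) == 0;
}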
/** Start resizing the log and release the exclusive latch.
@param size requested new file_size
@return whether the resizing was started successfully */
@@ -892,7 +916,7 @@ bool log_t::flush(lsn_t lsn) noexcept
{
ut_ad(lsn >= get_flushed_lsn());
flush_lock.set_pending(lsn);
- const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
+ const bool success{log_write_through || log.flush()};
if (UNIV_LIKELY(success))
{
flushed_to_disk_lsn.store(lsn, std::memory_order_release);
@@ -929,15 +953,6 @@ void log_write_up_to(lsn_t lsn, bool durable,
ut_ad(!srv_read_only_mode || log_sys.buf_free_ok());
ut_ad(lsn != LSN_MAX);
ut_ad(lsn != 0);
-
- if (UNIV_UNLIKELY(recv_no_ibuf_operations))
- {
- /* A non-final batch of recovery is active no writes to the log
- are allowed yet. */
- ut_a(!callback);
- return;
- }
-
ut_ad(lsn <= log_sys.get_lsn());
#ifdef HAVE_PMEM
@@ -963,6 +978,7 @@ repeat:
if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
group_commit_lock::ACQUIRED)
{
+ ut_ad(!recv_no_log_write || srv_operation != SRV_OPERATION_NORMAL);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
pending_write_lsn= write_lock.release(log_sys.write_buf<true>());
}
@@ -1081,18 +1097,15 @@ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
ib::info() << "Starting shutdown...";
- /* Wait until the master thread and all other operations are idle: our
+ /* Wait until the master task and all other operations are idle: our
algorithm only works if the server is idle at shutdown */
- bool do_srv_shutdown = false;
if (srv_master_timer) {
- do_srv_shutdown = srv_fast_shutdown < 2;
srv_master_timer.reset();
}
/* Wait for the end of the buffer resize task.*/
buf_resize_shutdown();
dict_stats_shutdown();
- btr_defragment_shutdown();
srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
@@ -1102,11 +1115,6 @@ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
}
srv_monitor_timer.reset();
- if (do_srv_shutdown) {
- srv_shutdown(srv_fast_shutdown == 0);
- }
-
-
loop:
ut_ad(lock_sys.is_initialised() || !srv_was_started);
ut_ad(log_sys.is_initialised() || !srv_was_started);
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 6b6a6868..44ba50e5 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -46,7 +46,6 @@ Created 9/20/1997 Heikki Tuuri
#include "page0page.h"
#include "page0cur.h"
#include "trx0undo.h"
-#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "trx0rec.h"
#include "fil0fil.h"
@@ -71,17 +70,6 @@ number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
recv_recovery_from_checkpoint_start(). */
bool recv_lsn_checks_on;
-/** If the following is TRUE, the buffer pool file pages must be invalidated
-after recovery and no ibuf operations are allowed; this becomes TRUE if
-the log record hash table becomes too full, and log records must be merged
-to file pages already before the recovery is finished: in this case no
-ibuf operations are allowed, as they could modify the pages read in the
-buffer pool before the pages have been recovered to the up-to-date state.
-
-true means that recovery is running and no operations on the log file
-are allowed yet: the variable name is misleading. */
-bool recv_no_ibuf_operations;
-
/** The maximum lsn we see for a page during the recovery process. If this
is bigger than the lsn we are able to scan up to, that is an indication that
the recovery failed and the database may be corrupt. */
@@ -739,7 +727,7 @@ static struct
retry:
log_sys.latch.wr_unlock();
fil_space_t *space= fil_system.sys_space;
- buf_block_t *free_block= buf_LRU_get_free_block(false);
+ buf_block_t *free_block= buf_LRU_get_free_block(have_no_mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
mysql_mutex_lock(&recv_sys.mutex);
@@ -938,9 +926,9 @@ void (*first_page_init)(uint32_t space_id);
FIXME: Rely on recv_sys.pages! */
class mlog_init_t
{
- using map= std::map<const page_id_t, recv_init,
+ using map= std::map<const page_id_t, lsn_t,
std::less<const page_id_t>,
- ut_allocator<std::pair<const page_id_t, recv_init>>>;
+ ut_allocator<std::pair<const page_id_t, lsn_t>>>;
/** Map of page initialization operations.
FIXME: Merge this to recv_sys.pages! */
map inits;
@@ -959,23 +947,20 @@ public:
bool add(const page_id_t page_id, lsn_t lsn)
{
mysql_mutex_assert_owner(&recv_sys.mutex);
- const recv_init init = { lsn, false };
std::pair<map::iterator, bool> p=
- inits.insert(map::value_type(page_id, init));
- ut_ad(!p.first->second.created);
+ inits.emplace(map::value_type{page_id, lsn});
if (p.second) return true;
- if (p.first->second.lsn >= lsn) return false;
- p.first->second = init;
- i = p.first;
+ if (p.first->second >= lsn) return false;
+ p.first->second= lsn;
+ i= p.first;
return true;
}
- /** Get the last stored lsn of the page id and its respective
- init/load operation.
+ /** Get the last initialization lsn of a page.
@param page_id page identifier
@return the latest page initialization;
not valid after releasing recv_sys.mutex. */
- recv_init &last(page_id_t page_id)
+ lsn_t last(page_id_t page_id)
{
mysql_mutex_assert_owner(&recv_sys.mutex);
return inits.find(page_id)->second;
@@ -989,90 +974,13 @@ public:
{
mysql_mutex_assert_owner(&recv_sys.mutex);
if (i != inits.end() && i->first == page_id)
- return i->second.lsn > lsn;
- i = inits.lower_bound(page_id);
- return i != inits.end() && i->first == page_id && i->second.lsn > lsn;
- }
-
- /** At the end of each recovery batch, reset the 'created' flags. */
- void reset()
- {
- mysql_mutex_assert_owner(&recv_sys.mutex);
- ut_ad(recv_no_ibuf_operations);
- for (map::value_type &i : inits)
- i.second.created= false;
- }
-
- /** During the last recovery batch, mark whether there exist
- buffered changes for the pages that were initialized
- by buf_page_create() and still reside in the buffer pool. */
- void mark_ibuf_exist()
- {
- mysql_mutex_assert_owner(&recv_sys.mutex);
-
- for (const map::value_type &i : inits)
- if (i.second.created)
- {
- auto &chain= buf_pool.page_hash.cell_get(i.first.fold());
- page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
-
- hash_lock.lock_shared();
- buf_block_t *block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash.get(i.first, chain));
- bool got_latch= block && block->page.lock.x_lock_try();
- hash_lock.unlock_shared();
-
- if (!block)
- continue;
-
- uint32_t state;
-
- if (!got_latch)
- {
- mysql_mutex_lock(&buf_pool.mutex);
- block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash.get(i.first, chain));
- if (!block)
- {
- mysql_mutex_unlock(&buf_pool.mutex);
- continue;
- }
-
- state= block->page.fix();
- mysql_mutex_unlock(&buf_pool.mutex);
- if (state < buf_page_t::UNFIXED)
- {
- block->page.unfix();
- continue;
- }
- block->page.lock.x_lock();
- state= block->page.unfix();
- ut_ad(state < buf_page_t::READ_FIX);
- if (state >= buf_page_t::UNFIXED && block->page.id() == i.first)
- goto check_ibuf;
- }
- else
- {
- state= block->page.state();
- ut_ad(state >= buf_page_t::FREED);
- ut_ad(state < buf_page_t::READ_FIX);
-
- if (state >= buf_page_t::UNFIXED)
- {
- check_ibuf:
- mysql_mutex_unlock(&recv_sys.mutex);
- if (ibuf_page_exists(block->page.id(), block->zip_size()))
- block->page.set_ibuf_exist();
- mysql_mutex_lock(&recv_sys.mutex);
- }
- }
-
- block->page.lock.x_unlock();
- }
+ return i->second > lsn;
+ i= inits.lower_bound(page_id);
+ return i != inits.end() && i->first == page_id && i->second > lsn;
}
/** Clear the data structure */
- void clear() { inits.clear(); i = inits.end(); }
+ void clear() { inits.clear(); i= inits.end(); }
};
static mlog_init_t mlog_init;
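After this simplification, mlog_init only has to remember the most recent page-initialization LSN per page instead of an (lsn, created) pair. A standalone model of that bookkeeping, with page_id reduced to an integer and the cached iterator omitted (a sketch, not the class above):

#include <cstdint>
#include <map>

using page_no_t = uint64_t;   // stand-in for page_id_t
using lsn_t     = uint64_t;

struct mlog_init_model
{
  std::map<page_no_t, lsn_t> inits;

  // Like add(): record the initialization, keeping only the newest LSN.
  bool add(page_no_t id, lsn_t lsn)
  {
    auto p = inits.emplace(id, lsn);
    if (p.second) return true;           // first initialization of the page
    if (p.first->second >= lsn) return false;
    p.first->second = lsn;               // a newer initialization wins
    return true;
  }

  // Like last(), except that it returns 0 when the page was never initialized.
  lsn_t last(page_no_t id) const
  {
    auto it = inits.find(id);
    return it == inits.end() ? 0 : it->second;
  }
};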
@@ -1099,9 +1007,9 @@ fil_space_t *recv_sys_t::recover_deferred(const recv_sys_t::map::iterator &p,
mtr_t mtr;
ut_ad(!p->second.being_processed);
p->second.being_processed= 1;
- init &init= mlog_init.last(p->first);
+ lsn_t init_lsn= mlog_init.last(p->first);
mysql_mutex_unlock(&mutex);
- buf_block_t *block= recover_low(p, mtr, free_block, init);
+ buf_block_t *block= recover_low(p, mtr, free_block, init_lsn);
mysql_mutex_lock(&mutex);
p->second.being_processed= -1;
ut_ad(block == free_block || block == reinterpret_cast<buf_block_t*>(-1));
@@ -1445,6 +1353,7 @@ void recv_sys_t::create()
recv_max_page_lsn = 0;
memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces);
+ truncated_sys_space= {0, 0};
UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU);
}
@@ -2054,7 +1963,7 @@ ATTRIBUTE_COLD buf_block_t *recv_sys_t::add_block()
UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
if (UNIV_LIKELY(bs > BUF_LRU_MIN_LEN || rs < bs))
{
- buf_block_t *block= buf_LRU_get_free_block(true);
+ buf_block_t *block= buf_LRU_get_free_block(have_mutex);
mysql_mutex_unlock(&buf_pool.mutex);
return block;
}
@@ -2781,23 +2690,28 @@ restart:
cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen);
if (rlen == 1 && *cl == TRIM_PAGES)
{
-#if 0 /* For now, we can only truncate an undo log tablespace */
- if (UNIV_UNLIKELY(!space_id || !page_no))
- goto record_corrupted;
-#else
- if (!srv_is_undo_tablespace(space_id) ||
- page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
- goto record_corrupted;
+ if (srv_is_undo_tablespace(space_id))
+ {
+ if (page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+ goto record_corrupted;
+ /* The entire undo tablespace will be reinitialized by
+ innodb_undo_log_truncate=ON. Discard old log for all
+ pages. */
+ trim({space_id, 0}, start_lsn);
+ truncated_undo_spaces[space_id - srv_undo_space_id_start]=
+ { start_lsn, page_no};
+ }
+ else if (space_id != 0) goto record_corrupted;
+ else
+ {
+ /* Shrink the system tablespace */
+ trim({space_id, page_no}, start_lsn);
+ truncated_sys_space= {start_lsn, page_no};
+ }
static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
TRX_SYS_MAX_UNDO_SPACES, "compatibility");
- /* The entire undo tablespace will be reinitialized by
- innodb_undo_log_truncate=ON. Discard old log for all pages. */
- trim({space_id, 0}, start_lsn);
- truncated_undo_spaces[space_id - srv_undo_space_id_start]=
- { start_lsn, page_no };
- if (!store && undo_space_trunc)
+ if (!store && undo_space_trunc && space_id)
undo_space_trunc(space_id);
-#endif
last_offset= 1; /* the next record must not be same_page */
continue;
}
@@ -3140,19 +3054,16 @@ lsn of a log record.
@param[in,out] mtr mini-transaction
@param[in,out] recs log records to apply
@param[in,out] space tablespace, or NULL if not looked up yet
-@param[in,out] init page initialization operation, or NULL
+@param[in,out] init_lsn page initialization LSN, or 0
@return the recovered page
@retval nullptr on failure */
static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
page_recv_t &recs,
- fil_space_t *space,
- recv_init *init)
+ fil_space_t *space, lsn_t init_lsn = 0)
{
mysql_mutex_assert_not_owner(&recv_sys.mutex);
ut_ad(recv_sys.apply_log_recs);
ut_ad(recv_needed_recovery);
- ut_ad(!init || init->created);
- ut_ad(!init || init->lsn);
ut_ad(recs.being_processed == 1);
ut_ad(!space || space->id == block->page.id().space());
ut_ad(log_sys.is_latest());
@@ -3168,13 +3079,12 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
? block->page.zip.data
: block->page.frame;
- const lsn_t page_lsn = init
+ const lsn_t page_lsn = init_lsn
? 0
: mach_read_from_8(frame + FIL_PAGE_LSN);
bool free_page = false;
lsn_t start_lsn = 0, end_lsn = 0;
ut_d(lsn_t recv_start_lsn = 0);
- const lsn_t init_lsn = init ? init->lsn : 0;
bool skipped_after_init = false;
@@ -3301,10 +3211,6 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
set_start_lsn:
if ((a == log_phys_t::APPLIED_CORRUPTED
|| recv_sys.is_corrupt_log()) && !srv_force_recovery) {
- if (init) {
- init->created = false;
- }
-
mtr.discard_modifications();
mtr.commit();
@@ -3339,12 +3245,9 @@ set_start_lsn:
UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page);
buf_pool.page_cleaner_wakeup();
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- } else if (free_page && init) {
+ } else if (free_page && init_lsn) {
/* There have been no operations that modify the page.
- Any buffered changes must not be merged. A subsequent
- buf_page_create() from a user thread should discard
- any buffered changes. */
- init->created = false;
+ Any buffered changes will be merged in ibuf_upgrade(). */
ut_ad(!mtr.has_modifications());
block->page.set_freed(block->page.state());
}
@@ -3440,11 +3343,9 @@ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage)
else
{
p->second.being_processed= 1;
- recv_sys_t::init *init= nullptr;
- if (p->second.skip_read)
- (init= &mlog_init.last(id))->created= true;
+ const lsn_t init_lsn{p->second.skip_read ? mlog_init.last(id) : 0};
mysql_mutex_unlock(&recv_sys.mutex);
- success= recv_recover_page(success, mtr, p->second, space, init);
+ success= recv_recover_page(success, mtr, p->second, space, init_lsn);
p->second.being_processed= -1;
goto func_exit;
}
@@ -3481,12 +3382,11 @@ void IORequest::fake_read_complete(os_offset_t offset) const
page_recv_t &recs= *reinterpret_cast<page_recv_t*>(slot);
ut_ad(recs.being_processed == 1);
- recv_init &init= *reinterpret_cast<recv_init*>(offset);
- ut_ad(init.lsn > 1);
- init.created= true;
+ const lsn_t init_lsn{offset};
+ ut_ad(init_lsn > 1);
if (recv_recover_page(reinterpret_cast<buf_block_t*>(bpage),
- mtr, recs, node->space, &init))
+ mtr, recs, node->space, init_lsn))
{
ut_ad(bpage->oldest_modification() || bpage->is_freed());
bpage->lock.x_unlock(true);
@@ -3660,9 +3560,9 @@ bool recv_sys_t::apply_batch(uint32_t space_id, fil_space_t *&space,
page_recv_t &recs= pages_it->second;
ut_ad(!recs.log.empty());
recs.being_processed= 1;
- init *init= recs.skip_read ? &mlog_init.last(id) : nullptr;
+ const lsn_t init_lsn{recs.skip_read ? mlog_init.last(id) : 0};
mysql_mutex_unlock(&mutex);
- buf_read_recover(space, id, recs, init);
+ buf_read_recover(space, id, recs, init_lsn);
}
if (!--n)
@@ -3703,7 +3603,7 @@ bool recv_sys_t::apply_batch(uint32_t space_id, fil_space_t *&space,
@retval nullptr if the page cannot be initialized based on log records
@retval -1 if the page cannot be recovered due to corruption */
inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr,
- buf_block_t *b, init &init)
+ buf_block_t *b, lsn_t init_lsn)
{
mysql_mutex_assert_not_owner(&mutex);
page_recv_t &recs= p->second;
@@ -3711,9 +3611,9 @@ inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr,
ut_ad(recs.being_processed == 1);
buf_block_t* block= nullptr;
const lsn_t end_lsn= recs.log.last()->lsn;
- if (end_lsn < init.lsn)
+ if (end_lsn < init_lsn)
DBUG_LOG("ib_log", "skip log for page " << p->first
- << " LSN " << end_lsn << " < " << init.lsn);
+ << " LSN " << end_lsn << " < " << init_lsn);
fil_space_t *space= fil_space_t::get(p->first.space());
mtr.start();
@@ -3757,8 +3657,7 @@ inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr,
ut_d(mysql_mutex_lock(&mutex));
ut_ad(&recs == &pages.find(p->first)->second);
ut_d(mysql_mutex_unlock(&mutex));
- init.created= true;
- block= recv_recover_page(block, mtr, recs, space, &init);
+ block= recv_recover_page(block, mtr, recs, space, init_lsn);
ut_ad(mtr.has_committed());
if (space)
@@ -3767,33 +3666,70 @@ inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr,
return block ? block : reinterpret_cast<buf_block_t*>(-1);
}
-/** Attempt to initialize a page based on redo log records.
+/** Read a page or recover it based on redo log records.
@param page_id page identifier
-@return recovered block
-@retval nullptr if the page cannot be initialized based on log records */
-ATTRIBUTE_COLD buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
+@param mtr mini-transaction
+@param err error code
+@return the requested block
+@retval nullptr if the page cannot be accessed due to corruption */
+ATTRIBUTE_COLD
+buf_block_t *
+recv_sys_t::recover(const page_id_t page_id, mtr_t *mtr, dberr_t *err)
{
+ if (!recovery_on)
+ must_read:
+ return buf_page_get_gen(page_id, 0, RW_NO_LATCH, nullptr, BUF_GET_RECOVER,
+ mtr, err);
+
mysql_mutex_lock(&mutex);
map::iterator p= pages.find(page_id);
- if (p != pages.end() && !p->second.being_processed && p->second.skip_read)
+ if (p == pages.end() || p->second.being_processed || !p->second.skip_read)
{
- p->second.being_processed= 1;
- init &init= mlog_init.last(page_id);
mysql_mutex_unlock(&mutex);
- buf_block_t *free_block= buf_LRU_get_free_block(false);
- mtr_t mtr;
- buf_block_t *block= recover_low(p, mtr, free_block, init);
- p->second.being_processed= -1;
- ut_ad(!block || block == reinterpret_cast<buf_block_t*>(-1) ||
- block == free_block);
- if (UNIV_UNLIKELY(!block))
- buf_pool.free_block(free_block);
- return block;
+ goto must_read;
}
+ p->second.being_processed= 1;
+ const lsn_t init_lsn{mlog_init.last(page_id)};
mysql_mutex_unlock(&mutex);
- return nullptr;
+ buf_block_t *free_block= buf_LRU_get_free_block(have_no_mutex);
+ buf_block_t *block;
+ {
+ mtr_t local_mtr;
+ block= recover_low(p, local_mtr, free_block, init_lsn);
+ }
+ p->second.being_processed= -1;
+ if (UNIV_UNLIKELY(!block))
+ {
+ buf_pool.free_block(free_block);
+ goto must_read;
+ }
+ else if (block == reinterpret_cast<buf_block_t*>(-1))
+ {
+ corrupted:
+ if (err)
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+
+ ut_ad(block == free_block);
+ auto s= block->page.fix();
+ ut_ad(s >= buf_page_t::FREED);
+ /* The block may be write-fixed at this point because we are not
+ holding a latch, but it must not be read-fixed. */
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ if (s < buf_page_t::UNFIXED)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ block->page.unfix();
+ buf_LRU_free_page(&block->page, true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto corrupted;
+ }
+
+ mtr->page_lock(block, RW_NO_LATCH);
+ return block;
}
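The new recover() thus folds the old recover_low(page_id) path into a read-or-replay dispatcher: with no usable buffered redo it falls back to buf_page_get_gen(), and a corrupted replay is reported through *err. A simplified standalone sketch of that control flow, with every InnoDB type replaced by a stand-in:

#include <cstdint>
#include <map>

enum class dberr { success, corruption };
struct block { /* stand-in for buf_block_t */ };

struct page_recs { bool being_processed; bool skip_read; };

struct recovery_model
{
  bool recovery_on = false;
  std::map<uint64_t, page_recs> pages;

  block *read_page(uint64_t)                    // normal buffer pool read
  { static block b; return &b; }

  block *replay_page(uint64_t, bool &corrupt)   // rebuild purely from the log
  { corrupt = false; static block b; return &b; }

  block *recover(uint64_t page_id, dberr *err)
  {
    if (!recovery_on)
      return read_page(page_id);
    auto p = pages.find(page_id);
    if (p == pages.end() || p->second.being_processed || !p->second.skip_read)
      return read_page(page_id);                // nothing buffered: plain read
    bool corrupt;
    if (block *b = replay_page(page_id, corrupt))
      return b;                                 // fully rebuilt from the log
    if (corrupt)
    {
      if (err) *err = dberr::corruption;
      return nullptr;
    }
    return read_page(page_id);                  // replay yielded nothing
  }
};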
inline fil_space_t *fil_system_t::find(const char *path) const
@@ -3879,15 +3815,36 @@ void recv_sys_t::apply(bool last_batch)
if (!pages.empty())
{
- recv_no_ibuf_operations = !last_batch ||
- srv_operation == SRV_OPERATION_RESTORE ||
- srv_operation == SRV_OPERATION_RESTORE_EXPORT;
ut_ad(!last_batch || lsn == scanned_lsn);
progress_time= time(nullptr);
report_progress();
apply_log_recs= true;
+ if (truncated_sys_space.lsn)
+ {
+ trim({0, truncated_sys_space.pages}, truncated_sys_space.lsn);
+ fil_node_t *file= UT_LIST_GET_LAST(fil_system.sys_space->chain);
+ ut_ad(file->is_open());
+
+ /* Last file new size after truncation */
+ uint32_t new_last_file_size=
+ truncated_sys_space.pages -
+ (srv_sys_space.get_min_size()
+ - srv_sys_space.m_files.at(
+             srv_sys_space.m_files.size() - 1).param_size());
+
+ os_file_truncate(
+ file->name, file->handle,
+ os_offset_t{new_last_file_size} << srv_page_size_shift, true);
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_system.sys_space->size= truncated_sys_space.pages;
+ fil_system.sys_space->chain.end->size= new_last_file_size;
+ srv_sys_space.set_last_file_size(new_last_file_size);
+ truncated_sys_space={0, 0};
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
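The last-file size above is derived by subtracting, from the new total page count, the pages that live in the preceding system tablespace files; the assumption here is that get_min_size() is the sum of the configured sizes of all files. A worked example with made-up numbers (two files, the last one autoextending):

#include <cassert>
#include <cstdint>

int main()
{
  const uint32_t truncated_pages = 1024;      // new total size, in pages
  const uint32_t min_size        = 768 + 128; // configured sizes of ibdata1 + ibdata2
  const uint32_t last_param_size = 128;       // configured size of the last file
  // Pages that belong to the files preceding the last one:
  const uint32_t preceding_pages = min_size - last_param_size;            // 768
  const uint32_t new_last_file_size = truncated_pages - preceding_pages;  // 256
  assert(new_last_file_size == 256);
  // The file is then truncated to new_last_file_size << srv_page_size_shift
  // bytes, e.g. 256 * 16 KiB = 4 MiB with the default page size.
  return 0;
}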
for (auto id= srv_undo_tablespaces_open; id--;)
{
const trunc& t= truncated_undo_spaces[id];
@@ -3933,7 +3890,7 @@ void recv_sys_t::apply(bool last_batch)
a redo log write and therefore acquire log_sys.latch. To avoid
deadlocks, log_sys.latch must not be acquired while holding
recv_sys.mutex. */
- free_block= buf_LRU_get_free_block(false);
+ free_block= buf_LRU_get_free_block(have_no_mutex);
if (!last_batch)
log_sys.latch.wr_lock(SRW_LOCK_CALL);
mysql_mutex_lock(&mutex);
@@ -3972,18 +3929,8 @@ void recv_sys_t::apply(bool last_batch)
}
}
- if (last_batch)
- {
- if (!recv_no_ibuf_operations)
- /* We skipped this in buf_page_create(). */
- mlog_init.mark_ibuf_exist();
- mlog_init.clear();
- }
- else
- {
- mlog_init.reset();
+ if (!last_batch)
log_sys.latch.wr_unlock();
- }
mysql_mutex_unlock(&mutex);
@@ -4746,7 +4693,6 @@ err_exit:
goto early_exit;
}
recv_sys.apply_log_recs = true;
- recv_no_ibuf_operations = false;
ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE
|| srv_operation == SRV_OPERATION_RESTORE_EXPORT);
if (srv_operation == SRV_OPERATION_NORMAL) {
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 74d3adb2..8db52ac1 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -203,7 +203,6 @@ void mtr_t::start()
m_made_dirty= false;
m_latch_ex= false;
- m_inside_ibuf= false;
m_modifications= false;
m_log_mode= MTR_LOG_ALL;
ut_d(m_user_space_id= TRX_SYS_SPACE);
@@ -483,7 +482,6 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
void mtr_t::commit()
{
ut_ad(is_active());
- ut_ad(!is_inside_ibuf());
/* This is a dirty read, for debugging. */
ut_ad(!m_modifications || !recv_no_log_write);
@@ -561,14 +559,12 @@ inline void fil_space_t::set_create_lsn(lsn_t lsn)
void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
{
ut_ad(is_active());
- ut_ad(!is_inside_ibuf());
ut_ad(!high_level_read_only);
ut_ad(m_modifications);
ut_ad(!m_memo.empty());
ut_ad(!recv_recovery_is_on());
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(!m_freed_pages);
- ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
log_write_and_flush_prepare();
m_latch_ex= true;
@@ -580,8 +576,16 @@ void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
fil_node_t *file= UT_LIST_GET_LAST(space.chain);
mysql_mutex_lock(&fil_system.mutex);
ut_ad(file->is_open());
- space.size= file->size= size;
- space.set_create_lsn(m_commit_lsn);
+ ut_ad(space.size >= size);
+ ut_ad(file->size >= space.size - size);
+ file->size-= space.size - size;
+ space.size= space.size_in_header= size;
+
+ if (space.id == TRX_SYS_SPACE)
+ srv_sys_space.set_last_file_size(file->size);
+ else
+ space.set_create_lsn(m_commit_lsn);
+
mysql_mutex_unlock(&fil_system.mutex);
space.clear_freed_ranges();
@@ -590,8 +594,8 @@ void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
log_write_and_flush();
ut_ad(log_sys.latch_have_wr());
- os_file_truncate(space.chain.start->name, space.chain.start->handle,
- os_offset_t{size} << srv_page_size_shift, true);
+ os_file_truncate(file->name, file->handle,
+ os_offset_t{file->size} << srv_page_size_shift, true);
space.clear_freed_ranges();
@@ -667,7 +671,6 @@ void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
bool mtr_t::commit_file(fil_space_t &space, const char *name)
{
ut_ad(is_active());
- ut_ad(!is_inside_ibuf());
ut_ad(!high_level_read_only);
ut_ad(m_modifications);
ut_ad(!m_made_dirty);
@@ -747,7 +750,6 @@ ATTRIBUTE_COLD lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
{
ut_ad(log_sys.latch_have_wr());
ut_ad(is_active());
- ut_ad(!is_inside_ibuf());
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(!m_made_dirty);
ut_ad(m_memo.empty());
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 3293db12..8f067110 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -65,7 +65,9 @@ Created 10/21/1995 Heikki Tuuri
#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
#ifdef _WIN32
-#include <winioctl.h>
+# include <winioctl.h>
+#elif !defined O_DSYNC
+# define O_DSYNC O_SYNC
#endif
// my_test_if_atomic_write() , my_win_secattr()
@@ -941,6 +943,8 @@ bool
os_file_flush_func(
os_file_t file)
{
+ if (UNIV_UNLIKELY(my_disable_sync)) return true;
+
int ret;
ret = os_file_sync_posix(file);
@@ -1003,46 +1007,35 @@ os_file_create_simple_func(
}
}
- bool retry;
-
+ if (fil_system.is_write_through()) create_flag |= O_DSYNC;
#ifdef O_DIRECT
- int direct_flag = 0;
- /* This function is always called for data files, we should disable
- OS caching (O_DIRECT) here as we do in os_file_create_func(), so
- we open the same file in the same mode, see man page of open(2). */
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
- direct_flag = O_DIRECT;
- break;
- }
+ int direct_flag = fil_system.is_buffered() ? 0 : O_DIRECT;
#else
constexpr int direct_flag = 0;
#endif
- do {
+ for (;;) {
file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) {
#ifdef O_DIRECT
if (direct_flag && errno == EINVAL) {
direct_flag = 0;
- retry = true;
continue;
}
#endif
- *success = false;
- retry = os_file_handle_error_no_exit(
- name,
- create_mode == OS_FILE_CREATE
- ? "create" : "open", false);
+
+ if (!os_file_handle_error_no_exit(
+ name,
+ create_mode == OS_FILE_CREATE
+ ? "create" : "open", false)) {
+ break;
+ }
} else {
*success = true;
- retry = false;
+ break;
}
-
- } while (retry);
+ }
if (!read_only
&& *success
@@ -1088,7 +1081,7 @@ os_file_create_directory(
}
#ifdef O_DIRECT
-# if defined __linux
+# ifdef __linux__
/** Note that the log file uses buffered I/O. */
static ATTRIBUTE_COLD void os_file_log_buffered()
{
@@ -1192,6 +1185,8 @@ os_file_create_func(
create_flag = O_RDWR | O_CLOEXEC;
}
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
#ifdef O_DIRECT
struct stat st;
ut_a(type == OS_LOG_FILE
@@ -1199,14 +1194,8 @@ os_file_create_func(
int direct_flag = 0;
if (type == OS_DATA_FILE) {
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
+ if (!fil_system.is_buffered()) {
direct_flag = O_DIRECT;
- break;
- default:
- break;
}
# ifdef __linux__
} else if (type != OS_LOG_FILE) {
@@ -1218,11 +1207,6 @@ os_file_create_func(
&& !log_sys.is_opened()) {
if (stat(name, &st)) {
if (errno == ENOENT) {
- if (create_mode & OS_FILE_ON_ERROR_SILENT) {
- goto not_found;
- }
- sql_print_error(
- "InnoDB: File %s was not found", name);
goto not_found;
}
goto skip_o_direct;
@@ -1240,18 +1224,12 @@ os_file_create_func(
ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
constexpr int direct_flag = 0;
#endif
- ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
- /* We let O_DSYNC only affect log files */
-
- if (!read_only
- && type == OS_LOG_FILE
- && srv_file_flush_method == SRV_O_DSYNC) {
-#ifdef O_DSYNC
+ if (read_only) {
+ } else if (type == OS_LOG_FILE
+ ? log_sys.log_write_through
+ : fil_system.is_write_through()) {
create_flag |= O_DSYNC;
-#else
- create_flag |= O_SYNC;
-#endif
}
os_file_t file;
@@ -1276,26 +1254,24 @@ os_file_create_func(
}
continue;
}
+# ifdef __linux__
+not_found:
+# endif
#endif
- if (!os_file_handle_error_no_exit(
+ if (os_file_handle_error_no_exit(
name, (create_flag & O_CREAT)
? "create" : "open",
create_mode & OS_FILE_ON_ERROR_SILENT)) {
- break;
+ continue;
}
+
+ return OS_FILE_CLOSED;
} else {
*success = true;
break;
}
}
- if (!*success) {
-#ifdef __linux__
-not_found:
-#endif
- return OS_FILE_CLOSED;
- }
-
#ifdef __linux__
if ((create_flag & O_CREAT) && type == OS_LOG_FILE) {
if (fstat(file, &st) || !os_file_log_maybe_unbuffered(st)) {
@@ -1746,6 +1722,9 @@ Flushes the write buffers of a given file to the disk.
@return true if success */
bool os_file_flush_func(os_file_t file)
{
+ if (UNIV_UNLIKELY(my_disable_sync))
+ return true;
+
++os_n_fsyncs;
static bool disable_datasync;
@@ -1919,6 +1898,11 @@ os_file_create_simple_func(
access = GENERIC_READ | GENERIC_WRITE;
}
+ if (fil_system.is_write_through())
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ if (!fil_system.is_buffered())
+ attributes |= FILE_FLAG_NO_BUFFERING;
+
for (;;) {
/* Use default security attributes and no template file. */
@@ -2057,25 +2041,16 @@ os_file_create_func(
if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING;
}
- if (srv_file_flush_method == SRV_O_DSYNC)
+ if (log_sys.log_write_through)
attributes|= FILE_FLAG_WRITE_THROUGH;
- }
- else if (type == OS_DATA_FILE) {
- switch (srv_file_flush_method) {
- case SRV_FSYNC:
- case SRV_LITTLESYNC:
- case SRV_NOSYNC:
- break;
- default:
+ } else {
+ if (type == OS_DATA_FILE && !fil_system.is_buffered())
attributes|= FILE_FLAG_NO_BUFFERING;
- }
+ if (fil_system.is_write_through())
+ attributes|= FILE_FLAG_WRITE_THROUGH;
}
- DWORD access = GENERIC_READ;
-
- if (!read_only) {
- access |= GENERIC_WRITE;
- }
+ DWORD access = read_only ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE;
for (;;) {
const char *operation;
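/* Illustrative sketch (not part of the patch): how the attribute bits are
   combined for a data file in the hunk above, with the
   fil_system.is_write_through()/is_buffered() probes passed in as plain
   booleans. data_file_attributes() is a hypothetical helper; the FILE_FLAG_*
   values are the regular Win32 constants. */
#include <windows.h>

DWORD data_file_attributes(bool write_through, bool buffered)
{
	DWORD attributes = 0;
	if (!buffered)
		attributes |= FILE_FLAG_NO_BUFFERING;  /* bypass the OS file cache */
	if (write_through)
		attributes |= FILE_FLAG_WRITE_THROUGH; /* write through the disk cache */
	return attributes;
}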
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index b019694b..2562ae1d 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2018, 2022, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1369,8 +1369,7 @@ page_cur_insert_rec_low(
ut_ad(!!page_is_comp(block->page.frame) == !!rec_offs_comp(offsets));
ut_ad(fil_page_index_page_check(block->page.frame));
ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame) ==
- index->id ||
- mtr->is_inside_ibuf());
+ index->id || index->is_dummy);
ut_ad(page_dir_get_n_slots(block->page.frame) >= 2);
ut_ad(!page_rec_is_supremum(cur->rec));
@@ -1769,11 +1768,6 @@ static inline void page_zip_dir_add_slot(buf_block_t *block,
Inserts a record next to page cursor on a compressed and uncompressed
page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if this is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to inserted record
@return nullptr on failure */
rec_t*
@@ -1797,8 +1791,8 @@ page_cur_insert_rec_zip(
ut_ad(rec_offs_comp(offsets));
ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX ||
fil_page_get_type(page) == FIL_PAGE_RTREE);
- ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + page) ==
- index->id || mtr->is_inside_ibuf());
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + page) == index->id ||
+ index->is_dummy);
ut_ad(!page_get_instant(page));
ut_ad(!page_cur_is_after_last(cursor));
#ifdef UNIV_ZIP_DEBUG
@@ -2265,8 +2259,7 @@ page_cur_delete_rec(
== index->table->not_redundant());
ut_ad(fil_page_index_page_check(block->page.frame));
ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame)
- == index->id
- || mtr->is_inside_ibuf());
+ == index->id || index->is_dummy);
ut_ad(mtr->is_named_space(index->table->space));
/* The record must not be the supremum or infimum record. */
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index 258d47a5..1060e702 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -336,17 +336,13 @@ page_create_zip(
/* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for
temporary tables. */
ut_ad(max_trx_id == 0 || !index->table->is_temporary());
- /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID
+ /* In secondary indexes, PAGE_MAX_TRX_ID
must be zero on non-leaf pages. max_trx_id can be 0 when the
- index consists of an empty root (leaf) page. */
- ut_ad(max_trx_id == 0
- || level == 0
- || !dict_index_is_sec_or_ibuf(index)
- || index->table->is_temporary());
- /* In the clustered index, PAGE_ROOT_AUTOINC or
+ index consists of an empty root (leaf) page.
+
+ In the clustered index, PAGE_ROOT_AUTOINC or
PAGE_MAX_TRX_ID must be 0 on other pages than the root. */
- ut_ad(level == 0 || max_trx_id == 0
- || !dict_index_is_sec_or_ibuf(index)
+ ut_ad(max_trx_id == 0 || level == 0 || index->is_primary()
|| index->table->is_temporary());
buf_block_modify_clock_inc(block);
@@ -390,8 +386,7 @@ page_create_empty(
same temp-table in parallel.
max_trx_id is ignored for temp tables because it not required
for MVCC. */
- if (dict_index_is_sec_or_ibuf(index)
- && !index->table->is_temporary()
+ if (!index->is_primary() && !index->table->is_temporary()
&& page_is_leaf(block->page.frame)) {
max_trx_id = page_get_max_trx_id(block->page.frame);
ut_ad(max_trx_id);
@@ -435,11 +430,6 @@ page_create_empty(
Differs from page_copy_rec_list_end, because this function does not
touch the lock table and max trx id on page or compress the page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return error code */
dberr_t
page_copy_rec_list_end_no_locks(
@@ -507,11 +497,6 @@ Copies records from page to new_page, from a given record onward,
including that record. Infimum and supremum records are not copied.
The records are copied to the start of the record list on new_page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-
@return pointer to the original successor of the infimum record on new_block
@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
@@ -603,8 +588,7 @@ err_exit:
same temp-table in parallel.
max_trx_id is ignored for temp tables because it not required
for MVCC. */
- if (dict_index_is_sec_or_ibuf(index)
- && page_is_leaf(page)
+ if (!index->is_primary() && page_is_leaf(page)
&& !index->table->is_temporary()) {
ut_ad(!was_empty || page_dir_get_n_heap(new_page)
== PAGE_HEAP_NO_USER_LOW
@@ -677,11 +661,6 @@ Copies records from page to new_page, up to the given record,
NOT including that record. Infimum and supremum records are not copied.
The records are copied to the end of the record list on new_page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
@return pointer to the original predecessor of the supremum record on new_block
@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
@@ -2057,7 +2036,7 @@ func_exit2:
max_trx_id is ignored for temp tables because it not required
for MVCC. */
if (!page_is_leaf(page) || page_is_empty(page)
- || !dict_index_is_sec_or_ibuf(index)
+ || index->is_primary()
|| index->table->is_temporary()) {
} else if (trx_id_t sys_max_trx_id = trx_sys.get_max_trx_id()) {
trx_id_t max_trx_id = page_get_max_trx_id(page);
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index e375fbfb..4eda8322 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -2,7 +2,7 @@
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -408,8 +408,6 @@ inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
static void page_zip_compress_write_log(buf_block_t *block,
dict_index_t *index, mtr_t *mtr)
{
- ut_ad(!index->is_ibuf());
-
if (!mtr->is_logged())
return;
@@ -463,8 +461,7 @@ page_zip_get_n_prev_extern(
ut_ad(page_is_leaf(page));
ut_ad(page_is_comp(page));
ut_ad(dict_table_is_comp(index->table));
- ut_ad(dict_index_is_clust(index));
- ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(index->is_primary());
heap_no = rec_get_heap_no_new(rec);
ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
@@ -1282,7 +1279,6 @@ page_zip_compress(
ut_ad(page_simple_validate_new((page_t*) page));
ut_ad(page_zip_simple_validate(page_zip));
ut_ad(dict_table_is_comp(index->table));
- ut_ad(!dict_index_is_ibuf(index));
MEM_CHECK_DEFINED(page, srv_page_size);
@@ -4371,10 +4367,6 @@ Reorganize and compress a page. This is a low-level operation for
compressed pages, to be used when page_zip_compress() fails.
On success, redo log will be written.
The function btr_page_reorganize() should be preferred whenever possible.
-IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
-non-clustered index, the caller must update the insert buffer free
-bits in the same mini-transaction in such a way that the modification
-will be redo-logged.
@return error code
@retval DB_FAIL on overflow; the block_zip will be left intact */
dberr_t
@@ -4395,7 +4387,6 @@ page_zip_reorganize(
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
ut_ad(block->page.zip.data);
ut_ad(page_is_comp(page));
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(!index->table->is_temporary());
/* Note that page_zip_validate(page_zip, page, index) may fail here. */
MEM_CHECK_DEFINED(page, srv_page_size);
@@ -4502,7 +4493,6 @@ page_zip_copy_recs(
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
- ut_ad(!dict_index_is_ibuf(index));
ut_ad(!index->table->is_temporary());
#ifdef UNIV_ZIP_DEBUG
/* The B-tree operations that call this function may set
diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc
index d190a001..3620363f 100644
--- a/storage/innobase/rem/rem0cmp.cc
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, 2022, MariaDB Corporation.
+Copyright (c) 2020, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -402,8 +402,8 @@ int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec,
ut_ad(!dfield_is_ext(dtuple_field));
- ret = cmp_data(type->mtype, type->prtype, !index->is_ibuf()
- && index->fields[cur_field].descending,
+ ret = cmp_data(type->mtype, type->prtype,
+ index->fields[cur_field].descending,
dtuple_b_ptr, dtuple_f_len,
rec_b_ptr, rec_f_len);
if (ret) {
@@ -481,7 +481,6 @@ cmp_dtuple_rec_with_match_bytes(
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!(REC_INFO_MIN_REC_FLAG
& dtuple_get_info_bits(dtuple)));
- ut_ad(!index->is_ibuf());
if (UNIV_UNLIKELY(REC_INFO_MIN_REC_FLAG
& rec_get_info_bits(rec, rec_offs_comp(offsets)))) {
@@ -833,32 +832,21 @@ cmp_rec_rec(
dict_index_get_n_unique_in_tree(index));
for (; cur_field < n_fields; cur_field++) {
- ulint mtype;
- ulint prtype;
- bool descending;
-
- if (UNIV_UNLIKELY(dict_index_is_ibuf(index))) {
- /* This is for the insert buffer B-tree. */
- mtype = DATA_BINARY;
+ const dict_field_t* field = dict_index_get_nth_field(
+ index, cur_field);
+ bool descending = field->descending;
+ ulint mtype = field->col->mtype;
+ ulint prtype = field->col->prtype;
+
+ if (UNIV_LIKELY(!index->is_spatial())) {
+ } else if (cur_field == 0) {
+ ut_ad(DATA_GEOMETRY_MTYPE(mtype));
+ prtype |= DATA_GIS_MBR;
+ } else if (!page_rec_is_leaf(rec2)) {
+ /* Compare the child page number. */
+ ut_ad(cur_field == 1);
+ mtype = DATA_SYS_CHILD;
prtype = 0;
- descending = false;
- } else {
- const dict_field_t* field = dict_index_get_nth_field(
- index, cur_field);
- descending = field->descending;
- mtype = field->col->mtype;
- prtype = field->col->prtype;
-
- if (UNIV_LIKELY(!dict_index_is_spatial(index))) {
- } else if (cur_field == 0) {
- ut_ad(DATA_GEOMETRY_MTYPE(mtype));
- prtype |= DATA_GIS_MBR;
- } else if (!page_rec_is_leaf(rec2)) {
- /* Compare the child page number. */
- ut_ad(cur_field == 1);
- mtype = DATA_SYS_CHILD;
- prtype = 0;
- }
}
/* We should never encounter an externally stored field.
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
index 47be5e44..46dcfb1b 100644
--- a/storage/innobase/rem/rem0rec.cc
+++ b/storage/innobase/rem/rem0rec.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -480,7 +480,7 @@ rec_offs_make_valid(
const bool is_alter_metadata = leaf
&& rec_is_alter_metadata(rec, *index);
ut_ad((leaf && rec_is_metadata(rec, *index))
- || index->is_dummy || index->is_ibuf()
+ || index->is_dummy
|| (leaf
? rec_offs_n_fields(offsets)
<= dict_index_get_n_fields(index)
@@ -882,18 +882,15 @@ rec_get_offsets_func(
/* The infimum and supremum records carry 1 field. */
ut_ad(is_user_rec || n == 1);
ut_ad(!is_user_rec || n_core || index->is_dummy
- || dict_index_is_ibuf(index)
|| n == n_fields /* dict_stats_analyze_index_level() */
|| n - 1
== dict_index_get_n_unique_in_tree_nonleaf(index));
ut_ad(!is_user_rec || !n_core || index->is_dummy
- || dict_index_is_ibuf(index)
|| n == n_fields /* btr_pcur_restore_position() */
|| (n + (index->id == DICT_INDEXES_ID) >= n_core));
if (is_user_rec && n_core && n < index->n_fields) {
ut_ad(!index->is_dummy);
- ut_ad(!dict_index_is_ibuf(index));
n = index->n_fields;
}
}
@@ -1972,7 +1969,7 @@ rec_copy_prefix_to_buf(
or NULL */
ulint* buf_size) /*!< in/out: buffer size */
{
- ut_ad(n_fields <= index->n_fields || dict_index_is_ibuf(index));
+ ut_ad(n_fields <= index->n_fields);
ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
UNIV_PREFETCH_RW(*buf);
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index 6194e9c3..5febd6df 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -293,6 +293,7 @@ struct fil_iterator_t {
byte* io_buffer; /*!< Buffer to use for IO */
fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */
byte* crypt_io_buffer; /*!< IO buffer when encrypted */
+ byte* crypt_tmp_buffer; /*!< Temporary buffer for crypt use */
};
/** Use the page cursor to iterate over records in a block. */
@@ -739,7 +740,8 @@ struct FetchIndexRootPages : public AbstractCallback {
/** Constructor
@param trx covering (user) transaction
@param table table definition in server .*/
- FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+ FetchIndexRootPages(const dict_table_t* table = nullptr,
+ trx_t* trx = nullptr)
:
AbstractCallback(trx, UINT32_MAX),
m_table(table), m_index(0, 0) UNIV_NOTHROW { }
@@ -754,18 +756,46 @@ struct FetchIndexRootPages : public AbstractCallback {
dberr_t run(const fil_iterator_t& iter,
buf_block_t* block) UNIV_NOTHROW override;
- /** Called for each block as it is read from the file.
+ /** Check that fsp flags and row formats match.
@param block block to convert, it is not from the buffer pool.
@retval DB_SUCCESS or error code. */
dberr_t operator()(buf_block_t* block) UNIV_NOTHROW override;
+ /** Get row format from the header and the root index page. */
+ enum row_type get_row_format(const buf_block_t &block)
+ {
+ if (!page_is_comp(block.page.frame))
+ return ROW_TYPE_REDUNDANT;
+ /* With full_crc32 we cannot tell dynamic from compact,
+ so we return not_used. We cannot simply return dynamic
+ or compact, because the caller could not tell whether
+ "dynamic" came from this branch or from the branch
+ below. Returning default would also work if it were
+ handled immediately, but it is still more ambiguous
+ than not_used, which is not a row_format at all. */
+ if (fil_space_t::full_crc32(m_space_flags))
+ return ROW_TYPE_NOT_USED;
+ if (!(m_space_flags & FSP_FLAGS_MASK_ATOMIC_BLOBS))
+ return ROW_TYPE_COMPACT;
+ if (FSP_FLAGS_GET_ZIP_SSIZE(m_space_flags))
+ return ROW_TYPE_COMPRESSED;
+ return ROW_TYPE_DYNAMIC;
+ }
+
/** Update the import configuration that will be used to import
the tablespace. */
dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
- /** Table definition in server. */
+ /** Table definition in server. When the table is being
+ created, there is no table yet, so m_table is nullptr. */
const dict_table_t* m_table;
+ /** Table row format. Only used when a (stub) table is being
+ created, in which case m_table is null; the row format is then
+ read from the .ibd file of the stub table. */
+ enum row_type m_row_format;
+
/** Index information */
Index m_index;
};
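/* Illustrative sketch (not part of the patch): the decision ladder of
   get_row_format() above, with the page/flag probes passed in as booleans so
   that the sketch stays standalone. row_format is a hypothetical enum, not
   the server's row_type; "unknown" plays the role of ROW_TYPE_NOT_USED. */
enum class row_format { redundant, compact, dynamic, compressed, unknown };

row_format deduce_row_format(bool comp_page, bool full_crc32,
                             bool atomic_blobs, bool zip_ssize)
{
	if (!comp_page)    return row_format::redundant;
	if (full_crc32)    return row_format::unknown; /* compact vs dynamic undecidable */
	if (!atomic_blobs) return row_format::compact;
	if (zip_ssize)     return row_format::compressed;
	return row_format::dynamic;
}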
@@ -2151,9 +2181,9 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
/* If we already had an old page with matching number
in the buffer pool, evict it now, because
we no longer evict the pages on DISCARD TABLESPACE. */
- buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH,
+ buf_page_get_gen(block->page.id(), get_zip_size(), RW_NO_LATCH,
nullptr, BUF_PEEK_IF_IN_POOL,
- nullptr, nullptr, false);
+ nullptr, nullptr);
uint16_t page_type;
@@ -2207,8 +2237,9 @@ row_import_cleanup(row_prebuilt_t* prebuilt,
dberr_t err,
dict_table_t* fts_table = nullptr)
{
+ dict_table_t* table = prebuilt->table;
+
if (err != DB_SUCCESS) {
- dict_table_t* table = prebuilt->table;
table->file_unreadable = true;
if (table->space) {
fil_close_tablespace(table->space_id);
@@ -2237,6 +2268,7 @@ row_import_cleanup(row_prebuilt_t* prebuilt,
if (err == DB_SUCCESS) {
reload_fts_table(prebuilt, fts_table);
+ table= prebuilt->table;
ib::warn() << "Added system generated FTS_DOC_ID "
"and FTS_DOC_ID_INDEX while importing "
"the tablespace " << prebuilt->table->name;
@@ -2272,7 +2304,25 @@ row_import_cleanup(row_prebuilt_t* prebuilt,
DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
- return(err);
+ if (err != DB_SUCCESS
+ || !dict_table_get_first_index(table)->is_gen_clust()) {
+ return err;
+ }
+
+ btr_cur_t cur;
+ mtr_t mtr;
+ mtr.start();
+ err = cur.open_leaf(false, dict_table_get_first_index(table),
+ BTR_SEARCH_LEAF, &mtr);
+ if (err != DB_SUCCESS) {
+ } else if (const rec_t *rec =
+ page_rec_get_prev(btr_cur_get_rec(&cur))) {
+ if (page_rec_is_user_rec(rec))
+ table->row_id= mach_read_from_6(rec);
+ }
+ mtr.commit();
+
+ return err;
}
/** Report error during tablespace import.
@@ -2411,55 +2461,6 @@ row_import_adjust_root_pages_of_secondary_indexes(
}
/*****************************************************************//**
-Ensure that dict_sys.row_id exceeds SELECT MAX(DB_ROW_ID). */
-MY_ATTRIBUTE((nonnull)) static
-void
-row_import_set_sys_max_row_id(
-/*==========================*/
- row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from
- handler */
- const dict_table_t* table) /*!< in: table to import */
-{
- const rec_t* rec;
- mtr_t mtr;
- btr_pcur_t pcur;
- row_id_t row_id = 0;
- dict_index_t* index;
-
- index = dict_table_get_first_index(table);
- ut_ad(index->is_primary());
- ut_ad(dict_index_is_auto_gen_clust(index));
-
- mtr_start(&mtr);
-
- mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
-
- if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr)
- == DB_SUCCESS) {
- rec = btr_pcur_move_to_prev_on_page(&pcur);
-
- if (!rec) {
- /* The table is corrupted. */
- } else if (page_rec_is_infimum(rec)) {
- /* The table is empty. */
- } else if (rec_is_metadata(rec, *index)) {
- /* The clustered index contains the metadata
- record only, that is, the table is empty. */
- } else {
- row_id = mach_read_from_6(rec);
- }
- }
-
- mtr_commit(&mtr);
-
- if (row_id) {
- /* Update the system row id if the imported index row id is
- greater than the max system row id. */
- dict_sys.update_row_id(row_id);
- }
-}
-
-/*****************************************************************//**
Read the a string from the meta data file.
@return DB_SUCCESS or error code. */
static
@@ -3142,17 +3143,25 @@ row_import_read_meta_data(
/* decrypt and decompress page if needed */
static dberr_t decrypt_decompress(fil_space_crypt_t *space_crypt,
uint32_t space_flags, span<byte> page,
- uint32_t space_id, byte *page_compress_buf)
+ uint32_t space_id, byte *page_compress_buf,
+ byte *tmp_frame)
{
auto *data= page.data();
if (space_crypt && space_crypt->should_encrypt())
{
+ uint page_size= static_cast<uint>(page.size());
+
if (!buf_page_verify_crypt_checksum(data, space_flags))
return DB_CORRUPTION;
- if (dberr_t err= fil_space_decrypt(space_id, space_flags, space_crypt,
- data, page.size(), data))
+ dberr_t err=
+ fil_space_decrypt(space_id, space_flags, space_crypt,
+ tmp_frame, page_size, data);
+
+ memcpy(data, tmp_frame, page_size);
+
+ if (err)
return err;
}
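/* Illustrative sketch (not part of the patch): the decrypt-into-scratch
   pattern used above. decrypt_page() is a hypothetical stand-in for
   fil_space_decrypt(); the point is that the ciphertext is decrypted into a
   separate tmp frame and then copied back, so the page is never used as both
   source and destination of the decryption. */
#include <cstring>
#include <cstddef>

bool decrypt_in_place(unsigned char *page, unsigned char *tmp, std::size_t size,
                      bool (*decrypt_page)(unsigned char *dst,
                                           const unsigned char *src,
                                           std::size_t size))
{
	const bool ok = decrypt_page(tmp, page, size);
	std::memcpy(page, tmp, size); /* mirrors the memcpy after fil_space_decrypt() */
	return ok;
}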
@@ -3404,11 +3413,16 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
return err;
std::unique_ptr<byte[]> page_compress_buf(new byte[get_buf_size()]);
+ std::unique_ptr<byte[], decltype(&aligned_free)> crypt_tmp_frame(
+ static_cast<byte *>(
+ aligned_malloc(physical_size, CPU_LEVEL1_DCACHE_LINESIZE)),
+ &aligned_free);
if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
{page.get(), static_cast<size_t>
(physical_size)},
- space_id, page_compress_buf.get()))
+ space_id, page_compress_buf.get(),
+ crypt_tmp_frame.get()))
return err;
if (table->supports_instant())
@@ -3462,7 +3476,8 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
{page.get(), static_cast<size_t>
(physical_size)}, space_id,
- page_compress_buf.get()))
+ page_compress_buf.get(),
+ crypt_tmp_frame.get()))
return err;
}
@@ -3544,7 +3559,8 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
{second_page.get(),
static_cast<size_t>(physical_size)},
- space_id, page_compress_buf.get()))
+ space_id, page_compress_buf.get(),
+ crypt_tmp_frame.get()))
return err;
if (fil_page_get_type(second_page.get()) != FIL_PAGE_TYPE_BLOB ||
@@ -3627,6 +3643,35 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
}
/**
+Read the contents of a .cfg file.
+@param[in] filename Path to the cfg file
+@param[in] thd Connection
+@param[out] cfg Contents of the .cfg file.
+@return DB_SUCCESS or error code. */
+static dberr_t row_import_read_cfg_internal(const char *filename, THD *thd,
+ row_import &cfg)
+{
+ FILE *file= fopen(filename, "rb");
+
+ cfg.m_missing= !file;
+
+ if (!file)
+ {
+ char msg[BUFSIZ];
+ snprintf(msg, sizeof(msg),
+ "Error opening '%s', will attempt to import"
+ " without schema verification", filename);
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
+ (ulong) errno, strerror(errno), msg);
+ return DB_FAIL;
+ }
+
+ dberr_t err= row_import_read_meta_data(file, thd, cfg);
+ fclose(file);
+ return err;
+}
+
+/**
Read the contents of the <tablename>.cfg file.
@return DB_SUCCESS or error code. */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -3637,38 +3682,60 @@ row_import_read_cfg(
THD* thd, /*!< in: session */
row_import& cfg) /*!< out: contents of the .cfg file */
{
- dberr_t err;
char name[OS_FILE_MAX_PATH];
cfg.m_table = table;
srv_get_meta_data_filename(table, name, sizeof(name));
- FILE* file = fopen(name, "rb");
-
- if (file == NULL) {
- char msg[BUFSIZ];
-
- snprintf(msg, sizeof(msg),
- "Error opening '%s', will attempt to import"
- " without schema verification", name);
-
- ib_senderrf(
- thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
- (ulong) errno, strerror(errno), msg);
-
- cfg.m_missing = true;
+ return row_import_read_cfg_internal(name, thd, cfg);
+}
- err = DB_FAIL;
- } else {
- cfg.m_missing = false;
+/** Convert the InnoDB ROW_FORMAT from rec_format_enum to row_type.
+@param[in] from ROW_FORMAT as a rec_format_enum
+@return the row_type representation of ROW_FORMAT. */
+static enum row_type from_rec_format(const rec_format_enum from)
+{
+ switch (from) {
+ case REC_FORMAT_COMPACT:
+ return ROW_TYPE_COMPACT;
+ case REC_FORMAT_DYNAMIC:
+ return ROW_TYPE_DYNAMIC;
+ case REC_FORMAT_REDUNDANT:
+ return ROW_TYPE_REDUNDANT;
+ case REC_FORMAT_COMPRESSED:
+ return ROW_TYPE_COMPRESSED;
+ }
- err = row_import_read_meta_data(file, thd, cfg);
- fclose(file);
- }
+ ut_ad("invalid format" == 0);
+ return ROW_TYPE_NOT_USED;
+}
- return(err);
+/**
+Read the row type from a .cfg file.
+@param dir_path Path to the data directory containing the .cfg file
+@param name Name of the table
+@param thd Connection
+@retval ROW_TYPE_COMPACT for ROW_FORMAT=COMPACT
+@retval ROW_TYPE_DYNAMIC for ROW_FORMAT=DYNAMIC
+@retval ROW_TYPE_REDUNDANT for ROW_FORMAT=REDUNDANT
+@retval ROW_TYPE_COMPRESSED for ROW_FORMAT=COMPRESSED
+@retval ROW_TYPE_NOT_USED to signal error */
+static enum row_type get_row_type_from_cfg(const char* dir_path,
+ const char* name, THD* thd)
+{
+ char* filename= fil_make_filepath(dir_path,
+ table_name_t(const_cast<char*>(name)),
+ CFG, dir_path != nullptr);
+ if (!filename)
+ return ROW_TYPE_NOT_USED;
+ row_import cfg;
+ dberr_t err= row_import_read_cfg_internal(filename, thd, cfg);
+ ut_free(filename);
+ if (err == DB_SUCCESS)
+ return from_rec_format(dict_tf_get_rec_format(cfg.m_flags));
+ return ROW_TYPE_NOT_USED;
}
/** Update the root page numbers and tablespace ID of a table.
@@ -3801,11 +3868,20 @@ row_import_set_discarded(
ulint flags2 = mach_read_from_4(
static_cast<byte*>(dfield_get_data(dfield)));
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic push
+# if __GNUC__ < 12 || defined WITH_UBSAN
+# pragma GCC diagnostic ignored "-Wconversion"
+# endif
+#endif
if (discard->state) {
flags2 |= DICT_TF2_DISCARDED;
} else {
flags2 &= ~DICT_TF2_DISCARDED;
}
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic pop
+#endif
mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2);
@@ -3986,8 +4062,14 @@ page_corrupted:
if (!buf_page_verify_crypt_checksum(readptr, m_space_flags))
goto page_corrupted;
- if ((err= fil_space_decrypt(get_space_id(), m_space_flags, iter.crypt_data,
- readptr, size, readptr)))
+ dberr_t err= fil_space_decrypt(get_space_id(), m_space_flags,
+ iter.crypt_data, iter.crypt_tmp_buffer,
+ size, readptr);
+
+ memcpy_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(readptr, iter.crypt_tmp_buffer,
+ size);
+
+ if (err)
goto func_exit;
}
@@ -4008,7 +4090,12 @@ page_corrupted:
&& buf_page_is_corrupted(false, readptr, m_space_flags))
goto page_corrupted;
- err= this->operator()(block);
+ /* m_table is null iff we are trying to create a (stub) table, in
+ which case we want to get row format for the table creation. */
+ if (m_table)
+ err= this->operator()(block);
+ else
+ m_row_format= get_row_format(*block);
func_exit:
free(page_compress_buf);
return err;
@@ -4331,19 +4418,21 @@ func_exit:
return err;
}
-/********************************************************************//**
-Iterate over all the pages in the tablespace.
-@param table - the table definiton in the server
-@param n_io_buffers - number of blocks to read and write together
-@param callback - functor that will do the page updates
+/**
+Iterate over all or some pages in the tablespace.
+@param dir_path the path to data dir storing the tablespace
+@param name the table name
+@param n_io_buffers number of blocks to read and write together
+@param callback functor that will do the page queries or updates
@return DB_SUCCESS or error code */
static
dberr_t
fil_tablespace_iterate(
/*===================*/
- dict_table_t* table,
- ulint n_io_buffers,
- AbstractCallback& callback)
+ const char *name,
+ ulint n_io_buffers,
+ AbstractCallback &callback,
+ const char *dir_path)
{
dberr_t err;
pfs_os_file_t file;
@@ -4355,18 +4444,9 @@ fil_tablespace_iterate(
DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
return(DB_CORRUPTION););
- /* Make sure the data_dir_path is set. */
- dict_get_and_save_data_dir_path(table);
-
- ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
-
- const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
- ? table->data_dir_path : nullptr;
-
- filepath = fil_make_filepath(data_dir_path,
- {table->name.m_name,
- strlen(table->name.m_name)},
- IBD, data_dir_path != nullptr);
+ table_name_t table_name(const_cast<char*>(name));
+ filepath= fil_make_filepath(dir_path, table_name, IBD,
+ dir_path != nullptr);
if (!filepath) {
return(DB_OUT_OF_MEMORY);
} else {
@@ -4379,9 +4459,9 @@ fil_tablespace_iterate(
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(true);
- ib::error() << "Trying to import a tablespace,"
- " but could not open the tablespace file "
- << filepath;
+ sql_print_error("InnoDB: could not open the "
+ "tablespace file %s.\n",
+ filepath);
ut_free(filepath);
return DB_TABLESPACE_NOT_FOUND;
} else {
@@ -4444,17 +4524,21 @@ fil_tablespace_iterate(
iter.file_size = file_size;
iter.n_io_buffers = n_io_buffers;
+ size_t buf_size = (1 + iter.n_io_buffers) * srv_page_size;
+
/* Add an extra page for compressed page scratch area. */
iter.io_buffer = static_cast<byte*>(
- aligned_malloc((1 + iter.n_io_buffers)
- << srv_page_size_shift, srv_page_size));
+ aligned_malloc(buf_size, srv_page_size));
- iter.crypt_io_buffer = iter.crypt_data
- ? static_cast<byte*>(
- aligned_malloc((1 + iter.n_io_buffers)
- << srv_page_size_shift,
- srv_page_size))
- : NULL;
+ if (iter.crypt_data) {
+ iter.crypt_io_buffer = static_cast<byte *>(
+ aligned_malloc(buf_size, srv_page_size));
+ iter.crypt_tmp_buffer = static_cast<byte *>(
+ aligned_malloc(buf_size, CPU_LEVEL1_DCACHE_LINESIZE));
+ } else {
+ iter.crypt_io_buffer = NULL;
+ iter.crypt_tmp_buffer = NULL;
+ }
if (block->page.zip.ssize) {
ut_ad(iter.n_io_buffers == 1);
@@ -4469,6 +4553,7 @@ fil_tablespace_iterate(
fil_space_destroy_crypt_data(&iter.crypt_data);
}
+ aligned_free(iter.crypt_tmp_buffer);
aligned_free(iter.crypt_io_buffer);
aligned_free(iter.io_buffer);
}
@@ -4493,6 +4578,24 @@ fil_tablespace_iterate(
return(err);
}
+/**
+Iterate over all or some pages in the tablespace.
+@param table the table definition in the server
+@param n_io_buffers number of blocks to read and write together
+@param callback functor that will do the page queries or updates
+@return DB_SUCCESS or error code */
+static dberr_t fil_tablespace_iterate(dict_table_t *table, ulint n_io_buffers,
+ AbstractCallback &callback)
+{
+ /* Make sure the data_dir_path is set. */
+ dict_get_and_save_data_dir_path(table);
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
+ const char *data_dir_path= DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ return fil_tablespace_iterate(table->name.m_name, n_io_buffers, callback,
+ data_dir_path);
+}
+
static void row_import_autoinc(dict_table_t *table, row_prebuilt_t *prebuilt,
uint64_t autoinc)
{
@@ -4621,8 +4724,6 @@ row_import_for_mysql(
ut_ad(!table->is_readable());
ut_ad(prebuilt->table == table);
- ibuf_delete_for_discarded_space(table->space_id);
-
#ifdef BTR_CUR_HASH_ADAPT
/* On DISCARD TABLESPACE, we did not drop any adaptive hash
index entries. If we replaced the discarded tablespace with a
@@ -4830,12 +4931,6 @@ import_error:
ut_free(filepath);
- if (err == DB_SUCCESS) {
- err = ibuf_check_bitmap_on_import(trx, table->space);
- }
-
- DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
-
if (err != DB_SUCCESS) {
return row_import_cleanup(prebuilt, err);
}
@@ -4893,13 +4988,6 @@ import_error:
goto import_error;
}
- /* Ensure that the next available DB_ROW_ID is not smaller than
- any DB_ROW_ID stored in the table. */
-
- if (prebuilt->clust_index_was_generated) {
- row_import_set_sys_max_row_id(prebuilt, table);
- }
-
ib::info() << "Phase III - Flush changes to disk";
/* Ensure that all pages dirtied during the IMPORT make it to disk.
@@ -4963,3 +5051,58 @@ import_error:
return row_import_cleanup(prebuilt, err, table);
}
+
+/** Prepare the create info to create a new stub table for import.
+@param thd Connection
+@param name Table name, format: "db/table_name".
+@param create_info The create info for creating a stub.
+@return ER_ error code
+@retval 0 on success */
+int prepare_create_stub_for_import(THD *thd, const char *name,
+ HA_CREATE_INFO& create_info)
+{
+ DBUG_ENTER("prepare_create_stub_for_import");
+ FetchIndexRootPages fetchIndexRootPages;
+ if (fil_tablespace_iterate(name, IO_BUFFER_SIZE(srv_page_size),
+ fetchIndexRootPages, fil_path_to_mysql_datadir)
+ != DB_SUCCESS)
+ {
+ const char *ibd_path= fil_make_filepath(
+ fil_path_to_mysql_datadir, table_name_t(const_cast<char*>(name)), IBD,
+ true);
+ if (!ibd_path)
+ DBUG_RETURN(ER_ENGINE_OUT_OF_MEMORY);
+ sql_print_error("InnoDB: failed to get row format from %s.\n",
+ ibd_path);
+ DBUG_RETURN(ER_INNODB_IMPORT_ERROR);
+ }
+ create_info.init();
+ /* get the row format from ibd. */
+ create_info.row_type= fetchIndexRootPages.m_row_format;
+ /* If a .cfg file exists, get the row format from it and compare it
+ with the one from the .ibd; report an error if they differ, except
+ when the .cfg reports compact/dynamic and the .ibd reports not_used
+ (meaning compact or dynamic, but we cannot tell which). */
+ const enum row_type row_type_from_cfg=
+ get_row_type_from_cfg(fil_path_to_mysql_datadir, name, thd);
+ if (row_type_from_cfg != ROW_TYPE_NOT_USED)
+ {
+ /* if ibd reports not_used but cfg reports compact or dynamic, go
+ with cfg. */
+ if (create_info.row_type != row_type_from_cfg &&
+ !((row_type_from_cfg == ROW_TYPE_COMPACT ||
+ row_type_from_cfg == ROW_TYPE_DYNAMIC) &&
+ create_info.row_type == ROW_TYPE_NOT_USED))
+ {
+ sql_print_error(
+ "InnoDB: cfg and ibd disagree on row format for table %s.\n",
+ name);
+ DBUG_RETURN(ER_INNODB_IMPORT_ERROR);
+ }
+ else
+ create_info.row_type= row_type_from_cfg;
+ }
+ else if (create_info.row_type == ROW_TYPE_NOT_USED)
+ create_info.row_type= ROW_TYPE_DYNAMIC;
+ DBUG_RETURN(0);
+}
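/* Illustrative sketch (not part of the patch): the .cfg/.ibd reconciliation
   that prepare_create_stub_for_import() applies above, reusing the
   hypothetical row_format enum from the earlier sketch ("unknown" again
   standing in for ROW_TYPE_NOT_USED). Returns false on a genuine mismatch. */
bool reconcile_row_format(row_format ibd, row_format cfg, row_format &out)
{
	if (cfg != row_format::unknown) {
		const bool ibd_ambiguous = ibd == row_format::unknown
			&& (cfg == row_format::compact
			    || cfg == row_format::dynamic);
		if (ibd != cfg && !ibd_ambiguous)
			return false;  /* .cfg and .ibd genuinely disagree */
		out = cfg;             /* otherwise trust the .cfg */
	} else {
		out = ibd == row_format::unknown ? row_format::dynamic : ibd;
	}
	return true;
}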
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index 1f319aae..952ccee4 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -31,7 +31,6 @@ Created 4/20/1996 Heikki Tuuri
#include "btr0btr.h"
#include "btr0cur.h"
#include "mach0data.h"
-#include "ibuf0ibuf.h"
#include "que0que.h"
#include "row0upd.h"
#include "row0sel.h"
@@ -2686,8 +2685,6 @@ err_exit:
page_set_autoinc(root, auto_inc, &mtr, false);
}
- btr_pcur_get_btr_cur(&pcur)->thr = thr;
-
#ifdef UNIV_DEBUG
{
page_t* page = btr_pcur_get_page(&pcur);
@@ -2972,7 +2969,6 @@ row_ins_sec_index_entry_low(
ut_ad(!dict_index_is_clust(index));
ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_INSERT_TREE);
- cursor.thr = thr;
cursor.rtr_info = NULL;
cursor.page_cur.index = index;
ut_ad(thr_get_trx(thr)->id != 0);
@@ -2994,9 +2990,10 @@ row_ins_sec_index_entry_low(
if (index->is_spatial()) {
rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
+ rtr_info.thr = thr;
rtr_info_update_btr(&cursor, &rtr_info);
- err = rtr_insert_leaf(&cursor, entry, search_mode, &mtr);
+ err = rtr_insert_leaf(&cursor, thr, entry, search_mode, &mtr);
if (err == DB_SUCCESS && search_mode == BTR_MODIFY_LEAF
&& rtr_info.mbr_adj) {
@@ -3005,6 +3002,7 @@ row_ins_sec_index_entry_low(
rtr_clean_rtr_info(&rtr_info, true);
rtr_init_rtr_info(&rtr_info, false, &cursor,
index, false);
+ rtr_info.thr = thr;
rtr_info_update_btr(&cursor, &rtr_info);
mtr.start();
if (index->table->is_temporary()) {
@@ -3012,7 +3010,7 @@ row_ins_sec_index_entry_low(
} else {
index->set_modified(mtr);
}
- err = rtr_insert_leaf(&cursor, entry,
+ err = rtr_insert_leaf(&cursor, thr, entry,
search_mode, &mtr);
}
@@ -3021,14 +3019,6 @@ row_ins_sec_index_entry_low(
goto func_exit;});
} else {
- if (!index->table->is_temporary()) {
- search_mode = btr_latch_mode(
- search_mode
- | (thr_get_trx(thr)->check_unique_secondary
- ? BTR_INSERT
- : BTR_INSERT | BTR_IGNORE_SEC_UNIQUE));
- }
-
err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode,
&mtr);
}
@@ -3040,12 +3030,6 @@ row_ins_sec_index_entry_low(
goto func_exit;
}
- if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
- ut_ad(!dict_index_is_spatial(index));
- /* The insert was buffered during the search: we are done */
- goto func_exit;
- }
-
#ifdef UNIV_DEBUG
{
page_t* page = btr_cur_get_page(&cursor);
@@ -3105,13 +3089,9 @@ row_ins_sec_index_entry_low(
locked with s-locks the necessary records to
prevent any insertion of a duplicate by another
transaction. Let us now reposition the cursor and
- continue the insertion (bypassing the change buffer). */
- err = cursor.search_leaf(
- entry, PAGE_CUR_LE,
- btr_latch_mode(search_mode
- & ~(BTR_INSERT
- | BTR_IGNORE_SEC_UNIQUE)),
- &mtr);
+ continue the insertion. */
+ err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode,
+ &mtr);
if (err != DB_SUCCESS) {
goto func_exit;
}
@@ -3342,11 +3322,6 @@ row_ins_sec_index_entry(
if (err == DB_FAIL) {
mem_heap_empty(heap);
- if (index->table->space == fil_system.sys_space
- && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) {
- ibuf_free_excess_pages();
- }
-
/* Try then pessimistic descent to the B-tree */
log_free_check();
@@ -3570,19 +3545,6 @@ row_ins_index_entry_step(
}
/***********************************************************//**
-Allocates a row id for row and inits the node->index field. */
-UNIV_INLINE
-void
-row_ins_alloc_row_id_step(
-/*======================*/
- ins_node_t* node) /*!< in: row insert node */
-{
- ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
- if (dict_table_get_first_index(node->table)->is_gen_clust())
- dict_sys_write_row_id(node->sys_buf, dict_sys.get_new_row_id());
-}
-
-/***********************************************************//**
Gets a row to insert from the values list. */
UNIV_INLINE
void
@@ -3662,13 +3624,18 @@ row_ins(
DBUG_PRINT("row_ins", ("table: %s", node->table->name.m_name));
if (node->state == INS_NODE_ALLOC_ROW_ID) {
-
- row_ins_alloc_row_id_step(node);
-
node->index = dict_table_get_first_index(node->table);
ut_ad(node->entry_list.empty() == false);
node->entry = node->entry_list.begin();
+ if (node->index->is_gen_clust()) {
+ const uint64_t db_row_id{++node->table->row_id};
+ if (db_row_id >> 48) {
+ DBUG_RETURN(DB_OUT_OF_FILE_SPACE);
+ }
+ mach_write_to_6(node->sys_buf, db_row_id);
+ }
+
if (node->ins_type == INS_SEARCHED) {
row_ins_get_row_from_select(node);
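/* Illustrative sketch (not part of the patch): DB_ROW_ID is stored in 6
   bytes, so the hunk above refuses values that do not fit in 48 bits.
   assign_row_id() is a hypothetical helper; the byte loop mirrors what
   mach_write_to_6() does (big-endian, most significant byte first). */
#include <cstdint>

bool assign_row_id(uint64_t &counter, unsigned char *sys_buf)
{
	const uint64_t row_id = ++counter;
	if (row_id >> 48)
		return false; /* the caller maps this to DB_OUT_OF_FILE_SPACE */
	for (int i = 0; i < 6; i++)
		sys_buf[i] = static_cast<unsigned char>(row_id >> (8 * (5 - i)));
	return true;
}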
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
index c4f46304..882e7c64 100644
--- a/storage/innobase/row/row0log.cc
+++ b/storage/innobase/row/row0log.cc
@@ -1701,22 +1701,6 @@ err_exit:
if (error) {
goto err_exit;
}
-#ifdef UNIV_DEBUG
- switch (btr_pcur_get_btr_cur(pcur)->flag) {
- case BTR_CUR_DELETE_REF:
- case BTR_CUR_DEL_MARK_IBUF:
- case BTR_CUR_DELETE_IBUF:
- case BTR_CUR_INSERT_TO_IBUF:
- /* We did not request buffering. */
- break;
- case BTR_CUR_HASH:
- case BTR_CUR_HASH_FAIL:
- case BTR_CUR_BINARY:
- goto flag_ok;
- }
- ut_ad(0);
-flag_ok:
-#endif /* UNIV_DEBUG */
if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
|| btr_pcur_get_low_match(pcur) < index->n_uniq) {
@@ -1724,8 +1708,8 @@ flag_ok:
found, because new_table is being modified by
this thread only, and all indexes should be
updated in sync. */
- mtr->commit();
- return(DB_INDEX_CORRUPT);
+ error = DB_INDEX_CORRUPT;
+ goto err_exit;
}
btr_cur_pessimistic_delete(&error, FALSE,
@@ -1785,22 +1769,6 @@ row_log_table_apply_delete(
if (err != DB_SUCCESS) {
goto all_done;
}
-#ifdef UNIV_DEBUG
- switch (btr_pcur_get_btr_cur(&pcur)->flag) {
- case BTR_CUR_DELETE_REF:
- case BTR_CUR_DEL_MARK_IBUF:
- case BTR_CUR_DELETE_IBUF:
- case BTR_CUR_INSERT_TO_IBUF:
- /* We did not request buffering. */
- break;
- case BTR_CUR_HASH:
- case BTR_CUR_HASH_FAIL:
- case BTR_CUR_BINARY:
- goto flag_ok;
- }
- ut_ad(0);
-flag_ok:
-#endif /* UNIV_DEBUG */
if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
|| btr_pcur_get_low_match(&pcur) < index->n_uniq) {
@@ -1934,19 +1902,6 @@ func_exit_committed:
return error;
}
-#ifdef UNIV_DEBUG
- switch (btr_pcur_get_btr_cur(&pcur)->flag) {
- case BTR_CUR_DELETE_REF:
- case BTR_CUR_DEL_MARK_IBUF:
- case BTR_CUR_DELETE_IBUF:
- case BTR_CUR_INSERT_TO_IBUF:
- ut_ad(0);/* We did not request buffering. */
- case BTR_CUR_HASH:
- case BTR_CUR_HASH_FAIL:
- case BTR_CUR_BINARY:
- break;
- }
-#endif /* UNIV_DEBUG */
ut_ad(!page_rec_is_infimum(btr_pcur_get_rec(&pcur))
&& btr_pcur_get_low_match(&pcur) >= index->n_uniq);
@@ -2096,8 +2051,17 @@ func_exit_committed:
ut_free(pcur.old_rec_buf);
pcur.old_rec_buf = nullptr;
- if (ROW_FOUND != row_search_index_entry(
- entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+ error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_TREE,
+ &pcur, &mtr);
+
+ if (error != DB_SUCCESS) {
+ ut_ad(0);
+ break;
+ }
+
+ if (btr_pcur_is_before_first_on_page(&pcur)
+ || btr_pcur_get_low_match(&pcur)
+ != dtuple_get_n_fields(entry)) {
ut_ad(0);
error = DB_CORRUPTION;
break;
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 6fb530f0..08ccaea2 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -150,7 +150,7 @@ public:
false);
rtr_info_update_btr(&ins_cur, &rtr_info);
- error = rtr_insert_leaf(&ins_cur, dtuple,
+ error = rtr_insert_leaf(&ins_cur, nullptr, dtuple,
BTR_MODIFY_LEAF, &mtr);
/* It need to update MBR in parent entry,
@@ -163,7 +163,8 @@ public:
rtr_info_update_btr(&ins_cur, &rtr_info);
mtr.start();
index->set_modified(mtr);
- error = rtr_insert_leaf(&ins_cur, dtuple,
+ error = rtr_insert_leaf(&ins_cur, nullptr,
+ dtuple,
BTR_MODIFY_TREE, &mtr);
}
@@ -186,7 +187,8 @@ public:
&ins_cur, index, false);
rtr_info_update_btr(&ins_cur, &rtr_info);
- error = rtr_insert_leaf(&ins_cur, dtuple,
+ error = rtr_insert_leaf(&ins_cur, nullptr,
+ dtuple,
BTR_MODIFY_TREE, &mtr);
if (error == DB_SUCCESS) {
@@ -2227,7 +2229,7 @@ end_of_index:
next_page_no),
old_table->space->zip_size(),
RW_S_LATCH, nullptr, BUF_GET, &mtr,
- &err, false);
+ &err);
if (!block) {
goto err_exit;
}
@@ -3687,8 +3689,6 @@ row_merge_mtuple_to_dtuple(
dtuple_t* dtuple,
const mtuple_t* mtuple)
{
- ut_ad(!dict_index_is_ibuf(index));
-
memcpy(dtuple->fields, mtuple->fields,
dtuple->n_fields * sizeof *mtuple->fields);
}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 6a71cf3a..4b570779 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -44,7 +44,6 @@ Created 9/17/2000 Heikki Tuuri
#include "fsp0file.h"
#include "fts0fts.h"
#include "fts0types.h"
-#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "log0log.h"
#include "pars0pars.h"
@@ -2339,12 +2338,7 @@ row_discard_tablespace(
2) Purge and rollback: we assign a new table id for the
table. Since purge and rollback look for the table based on
the table id, they see the table as 'dropped' and discard
- their operations.
-
- 3) Insert buffer: we remove all entries for the tablespace in
- the insert buffer tree. */
-
- ibuf_delete_for_discarded_space(table->space_id);
+ their operations. */
table_id_t new_id;
@@ -2446,15 +2440,23 @@ rollback:
/* Note: The following cannot be rolled back. Rollback would see the
UPDATE of SYS_INDEXES.TABLE_ID as two operations: DELETE and INSERT.
It would invoke btr_free_if_exists() when rolling back the INSERT,
- effectively dropping all indexes of the table. Furthermore, calls like
- ibuf_delete_for_discarded_space() are already discarding data
- before the transaction is committed.
+ effectively dropping all indexes of the table. Furthermore, we are
+ already discarding data before the transaction is committed.
It would be better to remove the integrity-breaking
ALTER TABLE...DISCARD TABLESPACE operation altogether. */
table->file_unreadable= true;
table->space= nullptr;
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic push
+# if __GNUC__ < 12 || defined WITH_UBSAN
+# pragma GCC diagnostic ignored "-Wconversion"
+# endif
+#endif
table->flags2|= DICT_TF2_DISCARDED;
+#if defined __GNUC__ && !defined __clang__
+# pragma GCC diagnostic pop
+#endif
err= row_discard_tablespace(trx, table);
DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
log_buffer_flush_to_disk(); DBUG_SUICIDE(););
@@ -2474,7 +2476,6 @@ rollback:
if (fts_exist)
purge_sys.resume_FTS();
- ibuf_delete_for_discarded_space(space_id);
buf_flush_remove_pages(space_id);
trx->op_info= "";
return err;
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
index d83ab861..adac8ecf 100644
--- a/storage/innobase/row/row0purge.cc
+++ b/storage/innobase/row/row0purge.cc
@@ -185,10 +185,6 @@ close_and_exit:
table = nullptr;
}
- if (space_id) {
- ibuf_delete_for_discarded_space(space_id);
- }
-
mtr.start();
index->set_modified(mtr);
@@ -277,10 +273,10 @@ not delete marked version of a clustered index record where DB_TRX_ID
is newer than the purge view.
NOTE: This function should only be called by the purge thread, only
-while holding a latch on the leaf page of the secondary index entry
-(or keeping the buffer pool watch on the page). It is possible that
-this function first returns true and then false, if a user transaction
-inserts a record that the secondary index entry would refer to.
+while holding a latch on the leaf page of the secondary index entry.
+It is possible that this function first returns true and then false,
+if a user transaction inserts a record that the secondary index entry
+would refer to.
However, in that case, the user transaction would also re-insert the
secondary index entry after purge has removed it and released the leaf
page latch.
@@ -296,6 +292,7 @@ page latch.
@param[in] is_tree true=pessimistic purge,
false=optimistic (leaf-page only)
@return true if the secondary index record can be purged */
+static
bool
row_purge_poss_sec(
purge_node_t* node,
@@ -353,14 +350,11 @@ row_purge_remove_sec_if_poss_tree(
pcur.btr_cur.page_cur.index = index;
if (index->is_spatial()) {
- if (!rtr_search(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
- goto found;
+ if (rtr_search(entry, BTR_PURGE_TREE, &pcur, nullptr, &mtr)) {
+ goto func_exit;
}
- goto func_exit;
- }
-
- switch (row_search_index_entry(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
- case ROW_NOT_FOUND:
+ } else if (!row_search_index_entry(entry, BTR_PURGE_TREE,
+ &pcur, &mtr)) {
/* Not found. This is a legitimate condition. In a
rollback, InnoDB will remove secondary recs that would
be purged anyway. Then the actual purge will not find
@@ -370,25 +364,13 @@ row_purge_remove_sec_if_poss_tree(
index, it will remove it. Then if/when the purge
comes to consider the secondary index record a second
time, it will not exist any more in the index. */
-
- /* fputs("PURGE:........sec entry not found\n", stderr); */
- /* dtuple_print(stderr, entry); */
goto func_exit;
- case ROW_FOUND:
- break;
- case ROW_BUFFERED:
- case ROW_NOT_DELETED_REF:
- /* These are invalid outcomes, because the mode passed
- to row_search_index_entry() did not include any of the
- flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
- ut_error;
}
/* We should remove the index record if no later version of the row,
which cannot be purged yet, requires its existence. If some requires,
we should do nothing. */
-found:
if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) {
/* Remove the index record, which should have been
@@ -457,24 +439,17 @@ row_purge_remove_sec_if_poss_leaf(
pcur.btr_cur.page_cur.index = index;
- /* Set the purge node for the call to row_purge_poss_sec(). */
- pcur.btr_cur.purge_node = node;
if (index->is_spatial()) {
- pcur.btr_cur.thr = NULL;
- if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, &mtr)) {
+ if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, nullptr,
+ &mtr)) {
goto found;
}
- goto func_exit;
- }
-
- /* Set the query thread, so that ibuf_insert_low() will be
- able to invoke thd_get_trx(). */
- pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
-
- switch (row_search_index_entry(entry, index->has_virtual()
- ? BTR_MODIFY_LEAF : BTR_PURGE_LEAF,
- &pcur, &mtr)) {
- case ROW_FOUND:
+ } else if (btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_LEAF, &pcur,
+ &mtr)
+ == DB_SUCCESS
+ && !btr_pcur_is_before_first_on_page(&pcur)
+ && btr_pcur_get_low_match(&pcur)
+ == dtuple_get_n_fields(entry)) {
found:
/* Before attempting to purge a record, check
if it is safe to do so. */
@@ -503,25 +478,18 @@ found:
if (index->is_spatial()) {
const buf_block_t* block = btr_cur_get_block(
btr_cur);
+ const page_id_t id{block->page.id()};
- if (block->page.id().page_no()
- != index->page
+ if (id.page_no() != index->page
&& page_get_n_recs(block->page.frame) < 2
- && !lock_test_prdt_page_lock(
- btr_cur->rtr_info
- && btr_cur->rtr_info->thr
- ? thr_get_trx(
- btr_cur->rtr_info->thr)
- : nullptr,
- block->page.id())) {
+ && !lock_test_prdt_page_lock(nullptr, id)){
/* this is the last record on page,
and it has a "page" lock on it,
which mean search is still depending
on it, so do not delete */
DBUG_LOG("purge",
"skip purging last"
- " record on page "
- << block->page.id());
+ " record on page " << id);
goto func_exit;
}
}
@@ -529,25 +497,13 @@ found:
success = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
!= DB_FAIL;
}
+ }
- /* (The index entry is still needed,
- or the deletion succeeded) */
- /* fall through */
- case ROW_NOT_DELETED_REF:
- /* The index entry is still needed. */
- case ROW_BUFFERED:
- /* The deletion was buffered. */
- case ROW_NOT_FOUND:
- /* The index entry does not exist, nothing to do. */
func_exit:
- mtr.commit();
+ mtr.commit();
cleanup:
- btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set?
- return(success);
- }
-
- ut_error;
- return(false);
+ btr_pcur_close(&pcur);
+ return success;
}
/***********************************************************//**
@@ -600,10 +556,7 @@ Purges a delete marking of a record.
@retval false the purge needs to be suspended because of
running out of file space */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
-bool
-row_purge_del_mark(
-/*===============*/
- purge_node_t* node) /*!< in/out: row purge node */
+bool row_purge_del_mark(purge_node_t *node)
{
if (node->index)
{
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
index 057b20c7..f86dbf2f 100644
--- a/storage/innobase/row/row0quiesce.cc
+++ b/storage/innobase/row/row0quiesce.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,7 +27,6 @@ Created 2012-02-08 by Sunny Bains.
#include "row0quiesce.h"
#include "row0mysql.h"
#include "buf0flu.h"
-#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "trx0purge.h"
@@ -539,18 +538,6 @@ row_quiesce_table_start(
purge_sys.stop();
}
- for (ulint count = 0;
- ibuf_merge_space(table->space_id);
- ++count) {
- if (trx_is_interrupted(trx)) {
- goto aborted;
- }
- if (!(count % 20)) {
- ib::info() << "Merging change buffer entries for "
- << table->name;
- }
- }
-
while (buf_flush_list_space(table->space)) {
if (trx_is_interrupted(trx)) {
goto aborted;
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
index 4a00b2a4..a7cddee0 100644
--- a/storage/innobase/row/row0row.cc
+++ b/storage/innobase/row/row0row.cc
@@ -215,28 +215,20 @@ row_build_index_entry_low(
entry = dtuple_create(heap, entry_len);
}
- if (dict_index_is_ibuf(index)) {
- dtuple_set_n_fields_cmp(entry, entry_len);
- /* There may only be externally stored columns
- in a clustered index B-tree of a user table. */
- ut_a(!ext);
- } else {
- dtuple_set_n_fields_cmp(
- entry, dict_index_get_n_unique_in_tree(index));
- if (dict_index_is_spatial(index)) {
- /* Set the MBR field */
- if (!row_build_spatial_index_key(
- index, ext,
- dtuple_get_nth_field(entry, 0),
- dtuple_get_nth_field(
- row,
- dict_index_get_nth_field(index, i)
- ->col->ind), flag, heap)) {
- return NULL;
- }
-
- i = 1;
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique_in_tree(index));
+ if (index->is_spatial()) {
+ /* Set the MBR field */
+ if (!row_build_spatial_index_key(
+ index, ext,
+ dtuple_get_nth_field(entry, 0),
+ dtuple_get_nth_field(
+ row,
+ dict_index_get_nth_field(index, i)
+ ->col->ind), flag, heap)) {
+ return NULL;
}
+
+ i = 1;
}
for (; i < entry_len; i++) {
@@ -1262,8 +1254,8 @@ row_get_clust_rec(
/***************************************************************//**
Searches an index record.
-@return whether the record was found or buffered */
-enum row_search_result
+@return whether the record was found */
+bool
row_search_index_entry(
/*===================*/
const dtuple_t* entry, /*!< in: index entry */
@@ -1272,47 +1264,14 @@ row_search_index_entry(
be closed by the caller */
mtr_t* mtr) /*!< in: mtr */
{
- ulint n_fields;
- ulint low_match;
- rec_t* rec;
-
ut_ad(dtuple_check_typed(entry));
if (btr_pcur_open(entry, PAGE_CUR_LE, mode, pcur, mtr) != DB_SUCCESS) {
- return ROW_NOT_FOUND;
- }
-
- switch (btr_pcur_get_btr_cur(pcur)->flag) {
- case BTR_CUR_DELETE_REF:
- ut_ad(!(~mode & BTR_DELETE));
- return(ROW_NOT_DELETED_REF);
-
- case BTR_CUR_DEL_MARK_IBUF:
- case BTR_CUR_DELETE_IBUF:
- case BTR_CUR_INSERT_TO_IBUF:
- return(ROW_BUFFERED);
-
- case BTR_CUR_HASH:
- case BTR_CUR_HASH_FAIL:
- case BTR_CUR_BINARY:
- break;
- }
-
- low_match = btr_pcur_get_low_match(pcur);
-
- rec = btr_pcur_get_rec(pcur);
-
- n_fields = dtuple_get_n_fields(entry);
-
- if (page_rec_is_infimum(rec)) {
-
- return(ROW_NOT_FOUND);
- } else if (low_match != n_fields) {
-
- return(ROW_NOT_FOUND);
+ return false;
}
- return(ROW_FOUND);
+ return !btr_pcur_is_before_first_on_page(pcur)
+ && btr_pcur_get_low_match(pcur) == dtuple_get_n_fields(entry);
}
/*******************************************************************//**
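
row_search_index_entry() now reports a plain boolean instead of the old row_search_result enumeration; with the change buffer gone there is no ROW_BUFFERED or ROW_NOT_DELETED_REF outcome left to distinguish. A self-contained sketch of the simplified found/not-found test (pcur_model and entry_found are hypothetical stand-ins for the btr_pcur_t accessors used above):

#include <cstddef>
#include <iostream>

/* Hypothetical stand-in for the relevant btr_pcur_t state. */
struct pcur_model {
  bool before_first_on_page;  /* btr_pcur_is_before_first_on_page() */
  std::size_t low_match;      /* btr_pcur_get_low_match() */
};

/* The simplified "found" test: the PAGE_CUR_LE positioning must not have
   landed before the first user record on the page, and every field of the
   search tuple must have matched. */
static bool entry_found(const pcur_model &pcur, std::size_t n_fields)
{
  return !pcur.before_first_on_page && pcur.low_match == n_fields;
}

int main()
{
  std::cout << entry_found({false, 3}, 3) << '\n'; /* 1: full match */
  std::cout << entry_found({false, 2}, 3) << '\n'; /* 0: prefix match only */
  std::cout << entry_found({true, 0}, 3) << '\n';  /* 0: cursor on infimum */
}
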
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 944f7358..0a38e30b 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -4039,7 +4039,8 @@ row_search_idx_cond_check(
ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
if (!prebuilt->idx_cond) {
- if (!handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ if (!prebuilt->pk_filter ||
+ !handler_rowid_filter_is_active(prebuilt->pk_filter)) {
return(CHECK_POS);
}
} else {
@@ -4081,7 +4082,8 @@ row_search_idx_cond_check(
switch (result) {
case CHECK_POS:
- if (handler_rowid_filter_is_active(prebuilt->pk_filter)) {
+ if (prebuilt->pk_filter &&
+ handler_rowid_filter_is_active(prebuilt->pk_filter)) {
ut_ad(!prebuilt->index->is_primary());
if (prebuilt->clust_index_was_generated) {
ulint len;
@@ -4765,14 +4767,13 @@ wait_table_again:
}
} else if (dtuple_get_n_fields(search_tuple) > 0) {
- pcur->btr_cur.thr = thr;
pcur->old_rec = nullptr;
if (index->is_spatial()) {
if (!prebuilt->rtr_info) {
prebuilt->rtr_info = rtr_create_rtr_info(
- set_also_gap_locks, true,
- btr_pcur_get_btr_cur(pcur), index);
+ set_also_gap_locks, true, thr,
+ btr_pcur_get_btr_cur(pcur));
prebuilt->rtr_info->search_tuple = search_tuple;
prebuilt->rtr_info->search_mode = mode;
rtr_info_update_btr(btr_pcur_get_btr_cur(pcur),
@@ -4785,7 +4786,8 @@ wait_table_again:
prebuilt->rtr_info->search_mode = mode;
}
- err = rtr_search_leaf(pcur, search_tuple, mode, &mtr);
+ err = rtr_search_leaf(pcur, thr, search_tuple, mode,
+ &mtr);
} else {
err = btr_pcur_open_with_no_init(search_tuple, mode,
BTR_SEARCH_LEAF,
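
The row0sel.cc hunks above guard handler_rowid_filter_is_active() with a null check, since prebuilt->pk_filter may not be set. A tiny illustrative model of that guard (rowid_filter and filter_is_active are made-up stand-ins, not the handler API):

#include <iostream>

struct rowid_filter { bool active; };  /* hypothetical stand-in */

/* The filter is consulted only when one is actually attached; the patch
   adds the null check in front of the activity test. */
static bool filter_is_active(const rowid_filter *pk_filter)
{
  return pk_filter && pk_filter->active;
}

int main()
{
  rowid_filter f{true};
  std::cout << filter_is_active(&f) << '\n';      /* 1 */
  std::cout << filter_is_active(nullptr) << '\n'; /* 0: no filter attached */
}
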
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
index 23255cc9..b78a2c41 100644
--- a/storage/innobase/row/row0uins.cc
+++ b/storage/innobase/row/row0uins.cc
@@ -40,7 +40,6 @@ Created 2/25/1997 Heikki Tuuri
#include "row0row.h"
#include "row0upd.h"
#include "que0que.h"
-#include "ibuf0ibuf.h"
#include "log0log.h"
#include "fil0fil.h"
#include <mysql/service_thd_mdl.h>
@@ -188,10 +187,6 @@ restart:
os_file_close(d);
}
- if (space_id) {
- ibuf_delete_for_discarded_space(space_id);
- }
-
mtr.start();
ut_a(node->pcur.restore_position(
BTR_MODIFY_LEAF, &mtr) == btr_pcur_t::SAME_ALL);
@@ -271,7 +266,7 @@ row_undo_ins_remove_sec_low(
const bool modify_leaf = mode == BTR_MODIFY_LEAF;
pcur.btr_cur.page_cur.index = index;
- row_mtr_start(&mtr, index, !modify_leaf);
+ row_mtr_start(&mtr, index);
if (index->is_spatial()) {
mode = modify_leaf
@@ -279,8 +274,7 @@ row_undo_ins_remove_sec_low(
| BTR_RTREE_DELETE_MARK
| BTR_RTREE_UNDO_INS)
: btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
- btr_pcur_get_btr_cur(&pcur)->thr = thr;
- if (rtr_search(entry, mode, &pcur, &mtr)) {
+ if (rtr_search(entry, mode, &pcur, thr, &mtr)) {
goto func_exit;
}
@@ -301,28 +295,17 @@ row_undo_ins_remove_sec_low(
mtr_x_lock_index(index, &mtr);
}
- switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
- case ROW_BUFFERED:
- case ROW_NOT_DELETED_REF:
- /* These are invalid outcomes, because the mode passed
- to row_search_index_entry() did not include any of the
- flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
- ut_error;
- case ROW_NOT_FOUND:
- break;
- case ROW_FOUND:
- found:
- btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
-
+ if (row_search_index_entry(entry, mode, &pcur, &mtr)) {
+found:
if (modify_leaf) {
- err = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
+ err = btr_cur_optimistic_delete(&pcur.btr_cur, 0, &mtr);
} else {
/* Passing rollback=false here, because we are
deleting a secondary index record: the distinction
only matters when deleting a record that contains
externally stored columns. */
- btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
- false, &mtr);
+ btr_cur_pessimistic_delete(&err, FALSE, &pcur.btr_cur,
+ 0, false, &mtr);
}
}
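
The undo path above picks btr_cur_optimistic_delete() under a leaf latch and btr_cur_pessimistic_delete() under a tree latch, with the caller retrying in the stronger mode when the cheap attempt cannot proceed. A simplified, self-contained sketch of that two-tier pattern (all names below are illustrative, not the btr0cur API):

#include <iostream>

enum class del_result { ok, fail };  /* simplified stand-ins */

/* A leaf-latched delete succeeds only while the page stays above its
   merge threshold (hypothetical condition). */
static del_result optimistic_delete(bool would_underflow)
{
  return would_underflow ? del_result::fail : del_result::ok;
}

/* The tree-latched variant may reorganize pages, so it always succeeds
   in this model. */
static del_result pessimistic_delete()
{
  return del_result::ok;
}

/* Two-tier removal: try the cheap leaf-only path first, then fall back
   to the tree-latched path on retry. */
static del_result remove_entry(bool would_underflow)
{
  if (optimistic_delete(would_underflow) == del_result::ok)
    return del_result::ok;
  return pessimistic_delete();
}

int main()
{
  std::cout << (remove_entry(false) == del_result::ok) << '\n'; /* 1 */
  std::cout << (remove_entry(true) == del_result::ok) << '\n';  /* 1 via fallback */
}
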
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index 52f54443..38d19882 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -33,7 +33,6 @@ Created 2/27/1997 Heikki Tuuri
#include "trx0purge.h"
#include "btr0btr.h"
#include "mach0data.h"
-#include "ibuf0ibuf.h"
#include "row0undo.h"
#include "row0vers.h"
#include "trx0trx.h"
@@ -491,7 +490,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
mtr_t mtr_vers;
const bool modify_leaf = mode == BTR_MODIFY_LEAF;
- row_mtr_start(&mtr, index, !modify_leaf);
+ row_mtr_start(&mtr, index);
pcur.btr_cur.page_cur.index = index;
btr_cur = btr_pcur_get_btr_cur(&pcur);
@@ -502,8 +501,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
| BTR_RTREE_DELETE_MARK
| BTR_RTREE_UNDO_INS)
: btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
- btr_cur->thr = thr;
- if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, thr, &mtr))) {
goto found;
} else {
goto func_exit;
@@ -527,9 +525,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
ut_ad(!dict_index_is_online_ddl(index));
}
- switch (UNIV_EXPECT(row_search_index_entry(entry, mode, &pcur, &mtr),
- ROW_FOUND)) {
- case ROW_NOT_FOUND:
+ if (!row_search_index_entry(entry, mode, &pcur, &mtr)) {
/* In crash recovery, the secondary index record may
be missing if the UPDATE did not have time to insert
the secondary index records before the crash. When we
@@ -540,14 +536,6 @@ row_undo_mod_del_mark_or_remove_sec_low(
before it has inserted all updated secondary index
records, then the undo will not find those records. */
goto func_exit;
- case ROW_FOUND:
- break;
- case ROW_BUFFERED:
- case ROW_NOT_DELETED_REF:
- /* These are invalid outcomes, because the mode passed
- to row_search_index_entry() did not include any of the
- flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
- ut_error;
}
found:
@@ -685,12 +673,13 @@ row_undo_mod_del_unmark_sec_and_undo_update(
}
try_again:
- row_mtr_start(&mtr, index, mode & 8);
+ row_mtr_start(&mtr, index);
- btr_cur->thr = thr;
+ mem_heap_t* offsets_heap = nullptr;
+ rec_offs* offsets = nullptr;
if (index->is_spatial()) {
- if (!rtr_search(entry, mode, &pcur, &mtr)) {
+ if (!rtr_search(entry, mode, &pcur, thr, &mtr)) {
goto found;
}
@@ -704,17 +693,7 @@ try_again:
goto not_found;
}
- switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
- mem_heap_t* heap;
- mem_heap_t* offsets_heap;
- rec_offs* offsets;
- case ROW_BUFFERED:
- case ROW_NOT_DELETED_REF:
- /* These are invalid outcomes, because the mode passed
- to row_search_index_entry() did not include any of the
- flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
- ut_error;
- case ROW_NOT_FOUND:
+ if (!row_search_index_entry(entry, mode, &pcur, &mtr)) {
not_found:
if (btr_cur->up_match >= dict_index_get_n_unique(index)
|| btr_cur->low_match >= dict_index_get_n_unique(index)) {
@@ -726,7 +705,7 @@ not_found:
<< " at: " << rec_index_print(
btr_cur_get_rec(btr_cur), index);
err = DB_DUPLICATE_KEY;
- break;
+ goto func_exit;
}
ib::warn() << "Record in index " << index->name
@@ -740,8 +719,6 @@ not_found:
delete-unmark. */
big_rec_t* big_rec;
rec_t* insert_rec;
- offsets = NULL;
- offsets_heap = NULL;
err = btr_cur_optimistic_insert(
flags, btr_cur, &offsets, &offsets_heap,
@@ -770,16 +747,13 @@ not_found:
if (offsets_heap) {
mem_heap_free(offsets_heap);
}
-
- break;
- case ROW_FOUND:
+ } else {
found:
btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur),
btr_cur_get_rec(btr_cur), &mtr);
- heap = mem_heap_create(
+ mem_heap_t* heap = mem_heap_create(
sizeof(upd_t)
+ dtuple_get_n_fields(entry) * sizeof(upd_field_t));
- offsets_heap = NULL;
offsets = rec_get_offsets(
btr_cur_get_rec(btr_cur),
index, nullptr, index->n_core_fields, ULINT_UNDEFINED,
@@ -818,6 +792,7 @@ found:
mem_heap_free(offsets_heap);
}
+func_exit:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
index a39574d2..76bd1eff 100644
--- a/storage/innobase/row/row0upd.cc
+++ b/storage/innobase/row/row0upd.cc
@@ -1836,9 +1836,7 @@ row_upd_sec_index_entry(
dict_index_t* index;
dberr_t err = DB_SUCCESS;
trx_t* trx = thr_get_trx(thr);
- btr_latch_mode mode;
ulint flags;
- enum row_search_result search_result;
ut_ad(trx->id != 0);
@@ -1866,7 +1864,6 @@ row_upd_sec_index_entry(
"before_row_upd_sec_index_entry");
mtr.start();
- mode = BTR_MODIFY_LEAF;
switch (index->table->space_id) {
case SRV_TMP_SPACE_ID:
@@ -1876,24 +1873,17 @@ row_upd_sec_index_entry(
default:
index->set_modified(mtr);
/* fall through */
- case IBUF_SPACE_ID:
+ case 0:
flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
- /* We can only buffer delete-mark operations if there
- are no foreign key constraints referring to the index. */
- if (!referenced) {
- mode = BTR_DELETE_MARK_LEAF;
- }
- break;
}
- /* Set the query thread, so that ibuf_insert_low() will be
- able to invoke thd_get_trx(). */
- pcur.btr_cur.thr = thr;
pcur.btr_cur.page_cur.index = index;
+ const rec_t *rec;
if (index->is_spatial()) {
- mode = btr_latch_mode(BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK);
- if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ constexpr btr_latch_mode mode = btr_latch_mode(
+ BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK);
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, thr, &mtr))) {
goto found;
}
@@ -1903,20 +1893,8 @@ row_upd_sec_index_entry(
}
goto not_found;
- }
-
- search_result = row_search_index_entry(entry, mode, &pcur, &mtr);
-
- switch (search_result) {
- const rec_t* rec;
- case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
- ut_error;
- break;
- case ROW_BUFFERED:
- /* Entry was delete marked already. */
- break;
-
- case ROW_NOT_FOUND:
+ } else if (!row_search_index_entry(entry, BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
not_found:
rec = btr_pcur_get_rec(&pcur);
ib::error()
@@ -1930,8 +1908,7 @@ not_found:
ut_ad(btr_validate_index(index, 0) == DB_SUCCESS);
ut_ad(0);
#endif /* UNIV_DEBUG */
- break;
- case ROW_FOUND:
+ } else {
found:
ut_ad(err == DB_SUCCESS);
rec = btr_pcur_get_rec(&pcur);
@@ -1946,7 +1923,7 @@ found:
btr_pcur_get_block(&pcur),
btr_pcur_get_rec(&pcur), index, thr, &mtr);
if (err != DB_SUCCESS) {
- break;
+ goto close;
}
btr_rec_set_deleted<true>(btr_pcur_get_block(&pcur),
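
The row_upd_sec_index_entry() hunk above drops the buffered delete-mark path: the latch mode no longer depends on whether foreign keys reference the index, and non-spatial indexes always use BTR_MODIFY_LEAF. A toy model of that simplification (the enum and functions below are illustrative only):

#include <iostream>

enum latch_choice { LEAF_MODIFY = 0, LEAF_DELETE_MARK_BUFFERED = 1 };  /* illustrative */

/* Before: delete-marking could be buffered when no foreign key referenced
   the index. After: the leaf page is always read and modified directly. */
static latch_choice choose_mode_before(bool referenced)
{
  return referenced ? LEAF_MODIFY : LEAF_DELETE_MARK_BUFFERED;
}

static latch_choice choose_mode_after(bool /* referenced is irrelevant now */)
{
  return LEAF_MODIFY;
}

int main()
{
  std::cout << choose_mode_before(false) << ' ' << choose_mode_after(false) << '\n'; /* 1 0 */
  std::cout << choose_mode_before(true) << ' ' << choose_mode_after(true) << '\n';   /* 0 0 */
}
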
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 62229842..2a22403e 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -2,7 +2,7 @@
Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,7 +27,6 @@ Created 12/9/2009 Jimmy Yang
#include "buf0flu.h"
#include "dict0mem.h"
-#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "mach0data.h"
#include "os0file.h"
@@ -517,23 +516,10 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_BUF_PAGE_READ("index_non_leaf","Index Non-leaf",
INDEX_NON_LEAF),
- MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf",
- INDEX_IBUF_LEAF),
-
- MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf",
- "Insert Buffer Index Non-Leaf",
- INDEX_IBUF_NON_LEAF),
-
MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG),
MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE),
- MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List",
- IBUF_FREELIST),
-
- MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap",
- IBUF_BITMAP),
-
MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM),
MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM),
@@ -556,23 +542,10 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_BUF_PAGE_WRITTEN("index_non_leaf","Index Non-leaf",
INDEX_NON_LEAF),
- MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf",
- INDEX_IBUF_LEAF),
-
- MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf",
- "Insert Buffer Index Non-Leaf",
- INDEX_IBUF_NON_LEAF),
-
MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG),
MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE),
- MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List",
- IBUF_FREELIST),
-
- MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap",
- IBUF_BITMAP),
-
MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM),
MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System",
@@ -941,57 +914,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED},
- /* ========== Counters for Change Buffer ========== */
- {"module_ibuf_system", "change_buffer", "InnoDB Change Buffer",
- MONITOR_MODULE,
- MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM},
-
- {"ibuf_merges_insert", "change_buffer",
- "Number of inserted records merged by change buffering",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT},
-
- {"ibuf_merges_delete_mark", "change_buffer",
- "Number of deleted records merged by change buffering",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE},
-
- {"ibuf_merges_delete", "change_buffer",
- "Number of purge records merged by change buffering",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE},
-
- {"ibuf_merges_discard_insert", "change_buffer",
- "Number of insert merged operations discarded",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT},
-
- {"ibuf_merges_discard_delete_mark", "change_buffer",
- "Number of deleted merged operations discarded",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE},
-
- {"ibuf_merges_discard_delete", "change_buffer",
- "Number of purge merged operations discarded",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE},
-
- {"ibuf_merges", "change_buffer", "Number of change buffer merges",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES},
-
- {"ibuf_size", "change_buffer", "Change buffer size in pages",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE},
-
/* ========== Counters for server operations ========== */
{"module_innodb", "innodb",
"Counter for general InnoDB server wide operations and properties",
@@ -1549,38 +1471,6 @@ srv_mon_process_existing_counter(
value = fil_system.n_open;
break;
- case MONITOR_OVLD_IBUF_MERGE_INSERT:
- value = ibuf.n_merged_ops[IBUF_OP_INSERT];
- break;
-
- case MONITOR_OVLD_IBUF_MERGE_DELETE:
- value = ibuf.n_merged_ops[IBUF_OP_DELETE_MARK];
- break;
-
- case MONITOR_OVLD_IBUF_MERGE_PURGE:
- value = ibuf.n_merged_ops[IBUF_OP_DELETE];
- break;
-
- case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
- value = ibuf.n_discarded_ops[IBUF_OP_INSERT];
- break;
-
- case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
- value = ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK];
- break;
-
- case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
- value = ibuf.n_discarded_ops[IBUF_OP_DELETE];
- break;
-
- case MONITOR_OVLD_IBUF_MERGES:
- value = ibuf.n_merges;
- break;
-
- case MONITOR_OVLD_IBUF_SIZE:
- value = ibuf.size;
- break;
-
case MONITOR_OVLD_SERVER_ACTIVITY:
value = srv_get_activity_count();
break;
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 7c0c4b92..7d2a6072 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -48,7 +48,6 @@ Created 10/8/1995 Heikki Tuuri
#include "buf0lru.h"
#include "dict0boot.h"
#include "dict0load.h"
-#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "log0recv.h"
#include "mem0mem.h"
@@ -61,7 +60,6 @@ Created 10/8/1995 Heikki Tuuri
#include "srv0start.h"
#include "trx0i_s.h"
#include "trx0purge.h"
-#include "btr0defragment.h"
#include "ut0mem.h"
#include "fil0fil.h"
#include "fil0crypt.h"
@@ -123,9 +121,9 @@ my_bool srv_read_only_mode;
/** store to its own file each table created by an user; data
dictionary tables are in the system tablespace 0 */
my_bool srv_file_per_table;
-/** Set if InnoDB operates in read-only mode or innodb-force-recovery
-is greater than SRV_FORCE_NO_TRX_UNDO. */
-my_bool high_level_read_only;
+/** Set if innodb_read_only is set or innodb_force_recovery
+is SRV_FORCE_NO_UNDO_LOG_SCAN or greater. */
+bool high_level_read_only;
/** Sort buffer size in index creation */
ulong srv_sort_buf_size;
@@ -216,13 +214,6 @@ in the buffer cache and accessed sequentially for InnoDB to trigger a
readahead request. */
ulong srv_read_ahead_threshold;
-/** innodb_change_buffer_max_size; maximum on-disk size of change
-buffer in terms of percentage of the buffer pool. */
-uint srv_change_buffer_max_size;
-
-ulong srv_file_flush_method;
-
-
/** copy of innodb_open_files; @see innodb_init_params() */
ulint srv_max_n_open_files;
@@ -279,7 +270,7 @@ my_bool srv_print_all_deadlocks;
INFORMATION_SCHEMA.innodb_cmp_per_index */
my_bool srv_cmp_per_index_enabled;
-/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+/** innodb_fast_shutdown=1 skips the purge of transaction history.
innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
of active transaction (to be done on restart). */
@@ -313,8 +304,6 @@ unsigned long long srv_stats_modified_counter;
based on number of configured pages */
my_bool srv_stats_sample_traditional;
-my_bool srv_use_doublewrite_buf;
-
/** innodb_sync_spin_loops */
ulong srv_n_spin_wait_rounds;
/** innodb_spin_wait_delay */
@@ -323,21 +312,6 @@ uint srv_spin_wait_delay;
/** Number of initialized rollback segments for persistent undo log */
ulong srv_available_undo_logs;
-/* Defragmentation */
-my_bool srv_defragment;
-/** innodb_defragment_n_pages */
-uint srv_defragment_n_pages;
-uint srv_defragment_stats_accuracy;
-/** innodb_defragment_fill_factor_n_recs */
-uint srv_defragment_fill_factor_n_recs;
-/** innodb_defragment_fill_factor */
-double srv_defragment_fill_factor;
-/** innodb_defragment_frequency */
-uint srv_defragment_frequency;
-/** derived from innodb_defragment_frequency;
-@see innodb_defragment_frequency_update() */
-ulonglong srv_defragment_interval;
-
/** Current mode of operation */
enum srv_operation_mode srv_operation;
@@ -381,8 +355,6 @@ FILE* srv_misc_tmpfile;
ulint srv_main_active_loops;
/** Iterations of the loop bounded by the 'srv_idle' label. */
ulint srv_main_idle_loops;
-/** Iterations of the loop bounded by the 'srv_shutdown' label. */
-static ulint srv_main_shutdown_loops;
/** Log writes involving flush. */
ulint srv_log_writes_and_flush;
@@ -548,10 +520,9 @@ srv_print_master_thread_info(
FILE *file) /* in: output stream */
{
fprintf(file, "srv_master_thread loops: " ULINTPF " srv_active, "
- ULINTPF " srv_shutdown, " ULINTPF " srv_idle\n"
+ ULINTPF " srv_idle\n"
"srv_master_thread log flush and writes: " ULINTPF "\n",
srv_main_active_loops,
- srv_main_shutdown_loops,
srv_main_idle_loops,
srv_log_writes_and_flush);
}
@@ -770,8 +741,6 @@ srv_printf_innodb_monitor(
"--------\n", file);
os_aio_print(file);
- ibuf_print(file);
-
#ifdef BTR_CUR_HASH_ADAPT
if (btr_search_enabled) {
fputs("-------------------\n"
@@ -953,11 +922,6 @@ srv_export_innodb_status(void)
export_vars.innodb_n_temp_blocks_decrypted =
srv_stats.n_temp_blocks_decrypted;
- export_vars.innodb_defragment_compression_failures =
- btr_defragment_compression_failures;
- export_vars.innodb_defragment_failures = btr_defragment_failures;
- export_vars.innodb_defragment_count = btr_defragment_count;
-
export_vars.innodb_onlineddl_rowlog_rows = onlineddl_rowlog_rows;
export_vars.innodb_onlineddl_rowlog_pct_used = onlineddl_rowlog_pct_used;
export_vars.innodb_onlineddl_pct_progress = onlineddl_pct_progress;
@@ -1272,31 +1236,6 @@ static void srv_sync_log_buffer_in_background()
}
}
-/** Report progress during shutdown.
-@param last time of last output
-@param n_read number of page reads initiated for change buffer merge */
-static void srv_shutdown_print(time_t &last, ulint n_read)
-{
- time_t now= time(nullptr);
- if (now - last >= 15)
- {
- last= now;
-
- const ulint ibuf_size= ibuf.size;
- sql_print_information("Completing change buffer merge;"
- " %zu page reads initiated;"
- " %zu change buffer pages remain",
- n_read, ibuf_size);
-#if defined HAVE_SYSTEMD && !defined EMBEDDED_LIBRARY
- service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
- "Completing change buffer merge;"
- " %zu page reads initiated;"
- " %zu change buffer pages remain",
- n_read, ibuf_size);
-#endif
- }
-}
-
/** Perform periodic tasks whenever the server is active.
@param counter_time microsecond_interval_timer() */
static void srv_master_do_active_tasks(ulonglong counter_time)
@@ -1334,32 +1273,6 @@ static void srv_master_do_idle_tasks(ulonglong counter_time)
MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
}
-/**
-Complete the shutdown tasks such as background DROP TABLE,
-and optionally change buffer merge (on innodb_fast_shutdown=0). */
-void srv_shutdown(bool ibuf_merge)
-{
- ulint n_read = 0;
- time_t now = time(NULL);
-
- do {
- ut_ad(!srv_read_only_mode);
- ut_ad(srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
- ++srv_main_shutdown_loops;
-
- if (ibuf_merge) {
- srv_main_thread_op_info = "doing insert buffer merge";
- /* Disallow the use of change buffer to
- avoid a race condition with
- ibuf_read_merge_pages() */
- ibuf_max_size_update(0);
- log_free_check();
- n_read = ibuf_contract();
- srv_shutdown_print(now, n_read);
- }
- } while (n_read);
-}
-
/** The periodic master task controlling the server. */
void srv_master_callback(void*)
{
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index fc557673..e13bc77d 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -2,7 +2,7 @@
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
@@ -62,10 +62,8 @@ Created 2/16/1996 Heikki Tuuri
#include "btr0btr.h"
#include "btr0cur.h"
#include "rem0rec.h"
-#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "srv0srv.h"
-#include "btr0defragment.h"
#include "mysql/service_wsrep.h" /* wsrep_recovery */
#include "trx0rseg.h"
#include "buf0flu.h"
@@ -88,6 +86,7 @@ Created 2/16/1996 Heikki Tuuri
#include "row0row.h"
#include "row0mysql.h"
#include "btr0pcur.h"
+#include "ibuf0ibuf.h"
#include "zlib.h"
#include "log.h"
@@ -370,6 +369,11 @@ inline dberr_t trx_sys_t::reset_page(mtr_t *mtr)
sys_header->page.frame + TRX_SYS_DOUBLEWRITE
+ FSEG_HEADER_SIZE + TRX_SYS_DOUBLEWRITE_REPEAT,
sys_header->page.frame + TRX_SYS_DOUBLEWRITE + FSEG_HEADER_SIZE, 12);
+ mtr->write<4>(
+ *sys_header,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ sys_header->page.frame,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
}
return DB_SUCCESS;
@@ -592,7 +596,8 @@ static uint32_t trx_rseg_get_n_undo_tablespaces()
mtr_t mtr;
mtr.start();
- if (const buf_block_t *sys_header= trx_sysf_get(&mtr, false))
+ if (const buf_block_t *sys_header=
+ recv_sys.recover({TRX_SYS_SPACE, TRX_SYS_PAGE_NO}, &mtr, nullptr))
for (ulint rseg_id= 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++)
if (trx_sysf_rseg_get_page_no(sys_header, rseg_id) != FIL_NULL)
if (uint32_t space= trx_sysf_rseg_get_space(sys_header, rseg_id))
@@ -822,7 +827,7 @@ unused_undo:
{
char name[OS_FILE_MAX_PATH];
snprintf(name, sizeof name, "%s/undo%03u", srv_undo_dir, i);
- uint32_t space_id= srv_undo_tablespace_open(create_new_undo, name, i);
+ uint32_t space_id= srv_undo_tablespace_open(false, name, i);
if (!space_id || space_id == ~0U)
break;
if (0 == srv_undo_tablespaces_open++)
@@ -1034,7 +1039,7 @@ srv_init_abort_low(
/** Prepare to delete the redo log file. Flush the dirty pages from all the
buffer pools. Flush the redo log buffer to the redo log file.
@return lsn upto which data pages have been flushed. */
-static lsn_t srv_prepare_to_delete_redo_log_file()
+ATTRIBUTE_COLD static lsn_t srv_prepare_to_delete_redo_log_file()
{
DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
@@ -1105,6 +1110,77 @@ same_size:
DBUG_RETURN(flushed_lsn);
}
+/** Upgrade the redo log to the latest format, or change its size
+or encryption, before starting to write any log records. */
+ATTRIBUTE_COLD static dberr_t srv_log_rebuild()
+{
+ /* Prepare to delete the old redo log file */
+ const lsn_t lsn{srv_prepare_to_delete_redo_log_file()};
+
+ DBUG_EXECUTE_IF("innodb_log_abort_1", return DB_ERROR;);
+ /* Prohibit redo log writes from any other threads until creating a
+ log checkpoint at the end of create_log_file(). */
+ ut_d(recv_no_log_write= true);
+ ut_ad(!os_aio_pending_reads());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
+ /* os_aio_pending_writes() may hold here if some write_io_callback()
+ did not release the slot yet. However, the page write itself must
+ have completed, because the buf_pool.flush_list is empty. In debug
+ builds, we wait for this to happen, hoping to get a hung process if
+ this assumption does not hold. */
+ ut_d(os_aio_wait_until_no_pending_writes(false));
+
+ /* Close the redo log file, so that we can replace it */
+ log_sys.close_file();
+
+ DBUG_EXECUTE_IF("innodb_log_abort_5", return DB_ERROR;);
+
+ dberr_t err= create_log_file(false, lsn);
+
+ if (err == DB_SUCCESS && log_sys.resize_rename())
+ err = DB_ERROR;
+
+ return err;
+}
+
+/** Rebuild the redo log if needed. */
+static dberr_t srv_log_rebuild_if_needed()
+{
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO)
+ /* Completely ignore the redo log. */
+ return DB_SUCCESS;
+ if (srv_read_only_mode)
+ /* Leave the redo log alone. */
+ return DB_SUCCESS;
+
+ if (log_sys.file_size == srv_log_file_size &&
+ log_sys.format ==
+ (srv_encrypt_log ? log_t::FORMAT_ENC_10_8 : log_t::FORMAT_10_8))
+ {
+ /* No need to add or remove encryption, upgrade, or resize. */
+ delete_log_files();
+ return DB_SUCCESS;
+ }
+
+ return srv_log_rebuild();
+}
+
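+/** Apply any recovered redo log records and rebuild the redo log
+if needed, before upgrading the change buffer.
+@return error code */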
+ATTRIBUTE_COLD static dberr_t ibuf_log_rebuild_if_needed()
+{
+ mysql_mutex_lock(&recv_sys.mutex);
+ recv_sys.apply(true);
+ mysql_mutex_unlock(&recv_sys.mutex);
+
+ if (recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs())
+ return DB_CORRUPTION;
+
+ dberr_t err= srv_log_rebuild_if_needed();
+ recv_sys.debug_free();
+ return err;
+}
+
static tpool::task_group rollback_all_recovered_group(1);
static tpool::task rollback_all_recovered_task(trx_rollback_all_recovered,
nullptr,
@@ -1132,7 +1208,7 @@ dberr_t srv_start(bool create_new_db)
if (srv_read_only_mode) {
sql_print_information("InnoDB: Started in read only mode");
- srv_use_doublewrite_buf = false;
+ buf_dblwr.use = buf_dblwr.USE_NO;
}
high_level_read_only = srv_read_only_mode
@@ -1147,10 +1223,6 @@ dberr_t srv_start(bool create_new_db)
ib::info() << "!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!";
#endif
-#ifdef UNIV_IBUF_DEBUG
- ib::info() << "!!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!";
-#endif
-
ib::info() << "Compressed tables use zlib " ZLIB_VERSION
#ifdef UNIV_ZIP_DEBUG
" with validation"
@@ -1240,11 +1312,6 @@ dberr_t srv_start(bool create_new_db)
return(srv_init_abort(err));
}
- if (srv_read_only_mode) {
- ib::info() << "Disabling background log and ibuf IO write"
- << " threads.";
- }
-
if (os_aio_init()) {
ib::error() << "Cannot initialize AIO sub-system";
@@ -1399,31 +1466,41 @@ dberr_t srv_start(bool create_new_db)
if (create_new_db) {
ut_ad(!srv_read_only_mode);
- mtr_start(&mtr);
+ mtr.start();
ut_ad(fil_system.sys_space->id == 0);
compile_time_assert(TRX_SYS_SPACE == 0);
- compile_time_assert(IBUF_SPACE_ID == 0);
- ut_a(fsp_header_init(fil_system.sys_space,
- uint32_t(sum_of_new_sizes), &mtr)
- == DB_SUCCESS);
-
- ulint ibuf_root = btr_create(
- DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space,
- DICT_IBUF_ID_MIN, nullptr, &mtr, &err);
-
- mtr_commit(&mtr);
-
- if (ibuf_root == FIL_NULL) {
- return srv_init_abort(err);
+ err = fsp_header_init(fil_system.sys_space,
+ uint32_t(sum_of_new_sizes), &mtr);
+ /* Allocate dummy change buffer pages for backward
+ compatibility and to prevent a downgrade. */
+ if (err != DB_SUCCESS) {
+ } else if (buf_block_t *b =
+ fseg_create(fil_system.sys_space, PAGE_DATA, &mtr,
+ &err)) {
+ ut_ad(b->page.id()
+ == page_id_t(0, FSP_IBUF_HEADER_PAGE_NO));
+ b = fseg_alloc_free_page_general(
+ b->page.frame + PAGE_DATA,
+ FSP_IBUF_TREE_ROOT_PAGE_NO, FSP_UP, false,
+ &mtr, &mtr, &err);
+ if (b) {
+ ut_ad(b->page.id() == page_id_t
+ (0, FSP_IBUF_TREE_ROOT_PAGE_NO));
+ mtr.set_modified(*b);
+ fsp_init_file_page(fil_system.sys_space, b,
+ &mtr);
+ } else {
+ ut_ad(err != DB_SUCCESS);
+ }
}
-
- ut_ad(ibuf_root == IBUF_TREE_ROOT_PAGE_NO);
-
/* To maintain backward compatibility we create only
the first rollback segment before the double write buffer.
All the remaining rollback segments will be created later,
after the double write buffer has been created. */
- err = trx_sys_create_sys_pages(&mtr);
+ if (err == DB_SUCCESS) {
+ err = trx_sys_create_sys_pages(&mtr);
+ }
+ mtr.commit();
if (err != DB_SUCCESS) {
return(srv_init_abort(err));
@@ -1457,39 +1534,45 @@ dberr_t srv_start(bool create_new_db)
recv_sys.dblwr.pages.clear();
- if (err != DB_SUCCESS) {
- return(srv_init_abort(err));
- }
+ bool must_upgrade_ibuf = false;
switch (srv_operation) {
case SRV_OPERATION_NORMAL:
case SRV_OPERATION_EXPORT_RESTORED:
case SRV_OPERATION_RESTORE_EXPORT:
- /* Initialize the change buffer. */
- err = dict_boot();
if (err != DB_SUCCESS) {
- return(srv_init_abort(err));
+ break;
}
- /* fall through */
- case SRV_OPERATION_RESTORE:
- /* This must precede recv_sys.apply(true). */
- srv_undo_tablespaces_active
- = trx_rseg_get_n_undo_tablespaces();
- if (srv_operation != SRV_OPERATION_RESTORE) {
- dict_sys.load_sys_tables();
+ err = ibuf_upgrade_needed();
+
+ if (UNIV_UNLIKELY(err == DB_FAIL)) {
+ must_upgrade_ibuf = true;
+ err = ibuf_log_rebuild_if_needed();
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
}
- err = trx_lists_init_at_db_start();
+
+ err = dict_boot();
+ /* fall through */
+ case SRV_OPERATION_RESTORE:
if (err != DB_SUCCESS) {
- return srv_init_abort(err);
+ break;
}
+
+ srv_undo_tablespaces_active
+ = trx_rseg_get_n_undo_tablespaces();
break;
- case SRV_OPERATION_RESTORE_DELTA:
- case SRV_OPERATION_BACKUP:
- case SRV_OPERATION_BACKUP_NO_DEFER:
+ default:
ut_ad("wrong mariabackup mode" == 0);
}
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
+
if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
/* Apply the hashed log records to the
respective file pages, for the last batch of
@@ -1511,9 +1594,45 @@ dberr_t srv_start(bool create_new_db)
return(srv_init_abort(DB_CORRUPTION));
}
+ if (srv_operation != SRV_OPERATION_RESTORE
+ || recv_needed_recovery) {
+ }
+
DBUG_PRINT("ib_log", ("apply completed"));
- if (recv_needed_recovery) {
+ if (srv_operation != SRV_OPERATION_RESTORE) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ dict_load_sys_table(dict_sys.sys_tables);
+ dict_sys.unlock();
+
+ if (UNIV_UNLIKELY(must_upgrade_ibuf)) {
+ dict_load_tablespaces(nullptr, true);
+ err = ibuf_upgrade();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
+ }
+
+ dict_sys.lock(SRW_LOCK_CALL);
+ dict_load_sys_table(dict_sys.sys_columns);
+ dict_load_sys_table(dict_sys.sys_indexes);
+ dict_load_sys_table(dict_sys.sys_fields);
+ dict_sys.unlock();
+ dict_sys.load_sys_tables();
+
+ err = trx_lists_init_at_db_start();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
+
+ if (recv_needed_recovery) {
+ trx_sys_print_mysql_binlog_offset();
+ }
+ } else if (recv_needed_recovery) {
+ err = trx_lists_init_at_db_start();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
trx_sys_print_mysql_binlog_offset();
}
}
@@ -1530,57 +1649,10 @@ dberr_t srv_start(bool create_new_db)
generating any dirty pages, so that the old redo log
file will not be written to. */
- if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
- /* Completely ignore the redo log. */
- } else if (srv_read_only_mode) {
- /* Leave the redo log alone. */
- } else if (log_sys.file_size == srv_log_file_size
- && log_sys.format
- == (srv_encrypt_log
- ? log_t::FORMAT_ENC_10_8
- : log_t::FORMAT_10_8)) {
- /* No need to add or remove encryption,
- upgrade, or resize. */
- delete_log_files();
- } else {
- /* Prepare to delete the old redo log file */
- const lsn_t lsn{srv_prepare_to_delete_redo_log_file()};
-
- DBUG_EXECUTE_IF("innodb_log_abort_1",
- return(srv_init_abort(DB_ERROR)););
- /* Prohibit redo log writes from any other
- threads until creating a log checkpoint at the
- end of create_log_file(). */
- ut_d(recv_no_log_write = true);
- ut_ad(!os_aio_pending_reads());
- ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
- ut_ad(!buf_pool.get_oldest_modification(0));
- ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
- /* os_aio_pending_writes() may hold here if
- some write_io_callback() did not release the
- slot yet. However, the page write itself must
- have completed, because the buf_pool.flush_list
- is empty. In debug builds, we wait for this to
- happen, hoping to get a hung process if this
- assumption does not hold. */
- ut_d(os_aio_wait_until_no_pending_writes(false));
-
- /* Close the redo log file, so that we can replace it */
- log_sys.close_file();
-
- DBUG_EXECUTE_IF("innodb_log_abort_5",
- return(srv_init_abort(DB_ERROR)););
- DBUG_PRINT("ib_log", ("After innodb_log_abort_5"));
-
- err = create_log_file(false, lsn);
-
- if (err == DB_SUCCESS && log_sys.resize_rename()) {
- err = DB_ERROR;
- }
+ err = srv_log_rebuild_if_needed();
- if (err != DB_SUCCESS) {
- return(srv_init_abort(err));
- }
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
}
recv_sys.debug_free();
@@ -1710,6 +1782,13 @@ dberr_t srv_start(bool create_new_db)
ut_ad(high_level_read_only
|| srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+ if (!high_level_read_only
+ && srv_sys_space.can_auto_shrink()) {
+ fsp_system_tablespace_truncate();
+ DBUG_EXECUTE_IF("crash_after_sys_truncate",
+ return srv_init_abort(DB_ERROR););
+ }
+
/* Validate a few system page types that were left
uninitialized before MySQL or MariaDB 5.5. */
if (!high_level_read_only
@@ -1719,8 +1798,7 @@ dberr_t srv_start(bool create_new_db)
/* Bitmap page types will be reset in
buf_dblwr_check_block() without redo logging. */
block = buf_page_get(
- page_id_t(IBUF_SPACE_ID,
- FSP_IBUF_HEADER_PAGE_NO),
+ page_id_t(0, FSP_IBUF_HEADER_PAGE_NO),
0, RW_X_LATCH, &mtr);
if (UNIV_UNLIKELY(!block)) {
corrupted_old_page:
@@ -1778,13 +1856,7 @@ dberr_t srv_start(bool create_new_db)
}
if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
- /* The following call is necessary for the change
- buffer to work with multiple tablespaces. We must
- know the mapping between space id's and .ibd file
- names.
-
- We also determine the maximum tablespace id used. */
- dict_check_tablespaces_and_store_max_id(nullptr);
+ dict_load_tablespaces();
}
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
@@ -1867,13 +1939,6 @@ skip_monitors:
trx_sys.get_max_trx_id());
}
- if (srv_force_recovery == 0) {
- /* In the change buffer we may have even bigger tablespace
- id's, because we may have dropped those tablespaces, but
- the buffered records have not been cleaned yet. */
- ibuf_update_max_tablespace_id();
- }
-
if (!srv_read_only_mode) {
if (create_new_db) {
srv_buffer_pool_load_at_startup = FALSE;
@@ -1904,9 +1969,6 @@ skip_monitors:
fil_crypt_threads_cond. */
fil_crypt_threads_init();
- /* Initialize online defragmentation. */
- btr_defragment_init();
-
srv_started_redo = true;
}
@@ -1927,15 +1989,9 @@ void innodb_preshutdown()
if (srv_read_only_mode)
return;
if (!srv_fast_shutdown && srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
- {
- /* Because a slow shutdown must empty the change buffer, we had
- better prevent any further changes from being buffered. */
- innodb_change_buffering= 0;
-
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO && srv_was_started)
while (trx_sys.any_active_transactions())
std::this_thread::sleep_for(std::chrono::milliseconds(1));
- }
srv_shutdown_bg_undo_sources();
srv_purge_shutdown();
@@ -1998,8 +2054,6 @@ void innodb_shutdown()
|| srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
ut_ad(lock_sys.is_initialised() || !srv_was_started);
ut_ad(log_sys.is_initialised() || !srv_was_started);
- ut_ad(ibuf.index || !innodb_change_buffering || !srv_was_started
- || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO);
dict_stats_deinit();
@@ -2009,7 +2063,6 @@ void innodb_shutdown()
fts_optimize_shutdown(); dict_stats_shutdown(); */
fil_crypt_threads_cleanup();
- btr_defragment_shutdown();
}
/* This must be disabled before closing the buffer pool
@@ -2020,7 +2073,6 @@ void innodb_shutdown()
btr_search_disable();
}
#endif /* BTR_CUR_HASH_ADAPT */
- ibuf_close();
log_sys.close();
purge_sys.close();
trx_sys.close();
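
The new srv_log_rebuild_if_needed() earlier in this file boils the old inline branching down to a three-way decision: ignore the redo log under SRV_FORCE_NO_LOG_REDO or read-only mode, keep it when its size and format already match the configuration, and otherwise rebuild it via srv_log_rebuild(). A standalone sketch of that decision with simplified stand-in types (log_state, decide are assumptions, not the log_sys API):

#include <cstdint>
#include <iostream>

enum class log_action { ignore, keep, rebuild };  /* illustrative labels */

struct log_state {       /* simplified stand-in for the relevant log_sys fields */
  uint64_t file_size;
  bool encrypted;        /* FORMAT_ENC_10_8 vs FORMAT_10_8 */
};

static log_action decide(bool force_no_log_redo, bool read_only,
                         const log_state &log, uint64_t wanted_size,
                         bool want_encryption)
{
  if (force_no_log_redo)
    return log_action::ignore;   /* completely ignore the redo log */
  if (read_only)
    return log_action::ignore;   /* leave the redo log alone */
  if (log.file_size == wanted_size && log.encrypted == want_encryption)
    return log_action::keep;     /* only delete_log_files() */
  return log_action::rebuild;    /* srv_log_rebuild() path */
}

int main()
{
  const log_state cur{100 << 20, false};
  std::cout << int(decide(false, false, cur, 100 << 20, false)) << '\n'; /* 1: keep */
  std::cout << int(decide(false, false, cur, 200 << 20, false)) << '\n'; /* 2: rebuild */
  std::cout << int(decide(true, false, cur, 200 << 20, false)) << '\n';  /* 0: ignore */
}
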
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index f32f4de5..85c6dfdb 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -190,7 +190,6 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
ut_a(undo_page);
trx_ulogf_t *undo_header= undo_page->page.frame + undo->hdr_offset;
- ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
ut_ad(rseg->needs_purge > trx->id);
ut_ad(rseg->last_page_no != FIL_NULL);
@@ -274,8 +273,6 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
undo_page->page.frame, undo_state);
mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, undo_header + TRX_UNDO_TRX_NO,
trx->rw_trx_hash_element->no);
- mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header +
- TRX_UNDO_NEEDS_PURGE, 1U);
}
/** Free an undo log segment.
@@ -836,7 +833,6 @@ bool purge_sys_t::rseg_get_next_history_log()
{
const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset;
trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
- ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
}
if (UNIV_LIKELY(trx_no != 0))
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 964dca94..abfdc920 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -458,12 +458,12 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
/* Access the tablespace header page to recover rseg->space->free_limit */
page_id_t page_id{rseg->space->id, 0};
dberr_t err;
- if (!buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err))
+ if (!buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr, &err))
return err;
mtr->release_last_page();
page_id.set_page_no(rseg->page_no);
const buf_block_t *rseg_hdr=
- buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr,
+ buf_page_get_gen(rseg->page_id(), 0, RW_X_LATCH, nullptr, BUF_GET, mtr,
&err);
if (!rseg_hdr)
return err;
@@ -563,8 +563,6 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
rseg->needs_purge= id;
rseg->set_last_commit(node_addr.boffset, id);
- ut_ad(mach_read_from_2(block->page.frame + node_addr.boffset +
- TRX_UNDO_NEEDS_PURGE) <= 1);
if (rseg->last_page_no != FIL_NULL)
/* There is no need to cover this operation by the purge
@@ -623,7 +621,7 @@ dberr_t trx_rseg_array_init()
purge_sys.queue_lock();
for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
mtr.start();
- if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, true)) {
if (rseg_id == 0) {
/* In case this is an upgrade from
before MariaDB 10.3.5, fetch the base
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
index 319ba99a..6f4f4423 100644
--- a/storage/innobase/trx/trx0sys.cc
+++ b/storage/innobase/trx/trx0sys.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -105,7 +105,6 @@ trx_sysf_get_n_rseg_slots()
/** Initialize the transaction system when creating the database. */
dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
{
- mtr->start();
mtr->x_lock_space(fil_system.sys_space);
static_assert(TRX_SYS_SPACE == 0, "compatibility");
@@ -114,11 +113,7 @@ dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
buf_block_t *block= fseg_create(fil_system.sys_space,
TRX_SYS + TRX_SYS_FSEG_HEADER, mtr, &err);
if (UNIV_UNLIKELY(!block))
- {
- error:
- mtr->commit();
return err;
- }
ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
@@ -138,9 +133,8 @@ dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
buf_block_t *r= trx_rseg_header_create(fil_system.sys_space, 0, 0,
mtr, &err);
if (UNIV_UNLIKELY(!r))
- goto error;
+ return err;
ut_a(r->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
- mtr->commit();
return trx_lists_init_at_db_start();
}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 1d22b853..ff9d8c55 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1256,7 +1256,7 @@ static void trx_flush_log_if_needed(lsn_t lsn, trx_t *trx)
return;
const bool flush=
- (srv_file_flush_method != SRV_NOSYNC &&
+ (!my_disable_sync &&
(srv_flush_log_at_trx_commit & 1));
completion_callback cb;
@@ -2019,8 +2019,7 @@ trx_prepare(
We must not be holding any mutexes or latches here. */
if (auto f = srv_flush_log_at_trx_commit) {
- log_write_up_to(lsn, (f & 1) && srv_file_flush_method
- != SRV_NOSYNC);
+ log_write_up_to(lsn, (f & 1) && !my_disable_sync);
}
if (!UT_LIST_GET_LEN(trx->lock.trx_locks)
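
In trx0trx.cc above, the durability test at commit and prepare now keys off my_disable_sync instead of the removed srv_file_flush_method: the log is fsynced only when sync is not globally disabled and innodb_flush_log_at_trx_commit has its low bit set. A one-function sketch of that predicate (should_flush is a made-up name):

#include <iostream>

/* Whether the commit must be accompanied by a durable log flush. */
static bool should_flush(bool my_disable_sync, unsigned flush_log_at_trx_commit)
{
  return !my_disable_sync && (flush_log_at_trx_commit & 1);
}

int main()
{
  std::cout << should_flush(false, 1) << '\n'; /* 1: write and fsync */
  std::cout << should_flush(false, 2) << '\n'; /* 0: write without fsync */
  std::cout << should_flush(true, 1) << '\n';  /* 0: sync disabled globally */
}
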
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index c0f5b1fb..0801ddb2 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -185,7 +185,7 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
return nullptr;
if (!buf_page_make_young_if_needed(&block->page))
- buf_read_ahead_linear(block->page.id(), 0, false);
+ buf_read_ahead_linear(block->page.id(), 0);
return trx_undo_page_get_last_rec(block, page_no, offset);
}
@@ -282,7 +282,7 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
return nullptr;
if (!buf_page_make_young_if_needed(&b->page))
- buf_read_ahead_linear(b->page.id(), 0, false);
+ buf_read_ahead_linear(b->page.id(), 0);
if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset))
return rec;
@@ -572,13 +572,8 @@ static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
undo_page->page.frame) != 0))
mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0);
- /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
- mach_write_to_2(buf, 1);
- memcpy_aligned<2>(buf + 2, start, 2);
- static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
- "compatibility");
- mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
- undo_page->page.frame, buf, 4);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_LOG_START +
+ undo_page->page.frame, start, 2);
/* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
if (prev_log)
{
@@ -985,7 +980,7 @@ trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no)
mtr.start();
const page_id_t page_id{rseg->space->id, page_no};
- const buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
+ const buf_block_t* block = recv_sys.recover(page_id, &mtr, nullptr);
if (UNIV_UNLIKELY(!block)) {
corrupted:
mtr.commit();
@@ -1099,9 +1094,8 @@ corrupted_type:
undo->last_page_no = last_addr.page;
undo->top_page_no = last_addr.page;
- const buf_block_t* last = buf_page_get(
- page_id_t(rseg->space->id, undo->last_page_no), 0,
- RW_X_LATCH, &mtr);
+ const buf_block_t* last = recv_sys.recover(
+ page_id_t(rseg->space->id, undo->last_page_no), &mtr, nullptr);
if (UNIV_UNLIKELY(!last)) {
goto corrupted_undo;