diff options
Diffstat (limited to 'storage/innobase/include')
51 files changed, 403 insertions, 1784 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 83bdaa97..35a567d7 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -56,12 +56,8 @@ is acceptable for the program to die with a clear assert failure. */ #define BTR_MAX_LEVELS 100 #define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ - btr_latch_mode((latch_mode) & ~(BTR_INSERT \ - | BTR_DELETE_MARK \ - | BTR_RTREE_UNDO_INS \ + btr_latch_mode((latch_mode) & ~(BTR_RTREE_UNDO_INS \ | BTR_RTREE_DELETE_MARK \ - | BTR_DELETE \ - | BTR_IGNORE_SEC_UNIQUE \ | BTR_ALREADY_S_LATCHED \ | BTR_LATCH_FOR_INSERT \ | BTR_LATCH_FOR_DELETE)) @@ -79,6 +75,14 @@ btr_root_adjust_on_import( const dict_index_t* index) /*!< in: index tree */ MY_ATTRIBUTE((warn_unused_result)); +/** Check a file segment header within a B-tree root page. +@param offset file segment header offset +@param block B-tree root page +@param space tablespace +@return whether the segment header is valid */ +bool btr_root_fseg_validate(ulint offset, const buf_block_t &block, + const fil_space_t &space); + /** Report a decryption failure. 
*/ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index); @@ -86,13 +90,12 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index); @param[in] index index tree @param[in] page page number @param[in] mode latch mode -@param[in] merge whether change buffer merge should be attempted @param[in,out] mtr mini-transaction @param[out] err error code @param[out] first set if this is a first-time access to the page @return block */ buf_block_t *btr_block_get(const dict_index_t &index, - uint32_t page, rw_lock_type_t mode, bool merge, + uint32_t page, rw_lock_type_t mode, mtr_t *mtr, dberr_t *err= nullptr, bool *first= nullptr); @@ -246,15 +249,7 @@ btr_root_raise_and_insert( mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*************************************************************//** -Reorganizes an index page. - -IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE -if this is a compressed leaf page in a secondary index. This has to -be done either within the same mini-transaction, or by invoking -ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, -IBUF_BITMAP_FREE is unaffected by reorganization. - +/** Reorganize an index page. @param cursor page cursor @param mtr mini-transaction @return error code @@ -352,6 +347,7 @@ btr_check_node_ptr( /*===============*/ dict_index_t* index, /*!< in: index tree */ buf_block_t* block, /*!< in: index page */ + que_thr_t* thr, /*!< in/out: query thread */ mtr_t* mtr) /*!< in: mtr */ MY_ATTRIBUTE((warn_unused_result)); #endif /* UNIV_DEBUG */ @@ -455,15 +451,8 @@ btr_root_block_get( or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err); /*!< out: error code */ -/*************************************************************//** -Reorganizes an index page. - -IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE -if this is a compressed leaf page in a secondary index. 
This has to -be done either within the same mini-transaction, or by invoking -ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, -IBUF_BITMAP_FREE is unaffected by reorganization. +/** Reorganize an index page. @return error code @retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ dberr_t btr_page_reorganize_block( @@ -534,9 +523,10 @@ btr_lift_page_up( must not be empty: use btr_discard_only_page_on_level if the last record from the page should be removed */ + que_thr_t* thr, /*!< in/out: query thread for SPATIAL INDEX */ mtr_t* mtr, /*!< in/out: mini-transaction */ dberr_t* err) /*!< out: error code */ - __attribute__((nonnull)); + __attribute__((nonnull(1,2,4,5))); #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index f6abc9f5..dc64054e 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -56,11 +56,7 @@ enum { BTR_KEEP_POS_FLAG = 8, /** the caller is creating the index or wants to bypass the index->info.online creation log */ - BTR_CREATE_FLAG = 16, - /** the caller of btr_cur_optimistic_update() or - btr_cur_update_in_place() will take care of - updating IBUF_BITMAP_FREE */ - BTR_KEEP_IBUF_BITMAP = 32 + BTR_CREATE_FLAG = 16 }; #include "que0types.h" @@ -213,14 +209,8 @@ btr_cur_pessimistic_insert( See if there is enough place in the page modification log to log an update-in-place. -@retval false if out of space; IBUF_BITMAP_FREE will be reset -outside mtr if the page was recompressed -@retval true if enough place; - -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is -a secondary index leaf page. This has to be done either within the -same mini-transaction, or by invoking ibuf_reset_free_bits() before -mtr_commit(mtr). 
*/ +@retval false if out of space +@retval true if enough place */ bool btr_cur_update_alloc_zip_func( /*==========================*/ @@ -262,7 +252,7 @@ Updates a record when the update causes no size changes in its fields. @return locking or undo log related error code, or @retval DB_SUCCESS on success @retval DB_ZIP_OVERFLOW if there is not enough space left -on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +on a ROW_FORMAT=COMPRESSED page */ dberr_t btr_cur_update_in_place( /*====================*/ @@ -669,28 +659,13 @@ enum btr_cur_method { reference is stored in the field hash_node, and might be necessary to update */ - BTR_CUR_BINARY, /*!< success using the binary search */ - BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to - the insert buffer */ - BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete - mark in the insert/delete buffer */ - BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in - the insert/delete buffer */ - BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */ + BTR_CUR_BINARY /*!< success using the binary search */ }; /** The tree cursor: the definition appears here only for the compiler to know struct size! 
*/ struct btr_cur_t { page_cur_t page_cur; /*!< page cursor */ - purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */ - /*------------------------------*/ - que_thr_t* thr; /*!< this field is only used - when search_leaf() - is called for an index entry - insertion: the calling query - thread is passed here to be - used in the insert buffer */ /*------------------------------*/ /** The following fields are used in search_leaf() to pass information: */ diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h deleted file mode 100644 index 0523829b..00000000 --- a/storage/innobase/include/btr0defragment.h +++ /dev/null @@ -1,65 +0,0 @@ -/***************************************************************************** - -Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. -Copyright (C) 2014, 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -#ifndef btr0defragment_h -#define btr0defragment_h - -#include "btr0pcur.h" - -/* Max number of pages to consider at once during defragmentation. 
*/ -#define BTR_DEFRAGMENT_MAX_N_PAGES 32 - -/** stats in btr_defragment */ -extern Atomic_counter<ulint> btr_defragment_compression_failures; -extern Atomic_counter<ulint> btr_defragment_failures; -extern Atomic_counter<ulint> btr_defragment_count; - -/******************************************************************//** -Initialize defragmentation. */ -void -btr_defragment_init(void); -/******************************************************************//** -Shutdown defragmentation. */ -void -btr_defragment_shutdown(); -/******************************************************************//** -Check whether the given index is in btr_defragment_wq. */ -bool -btr_defragment_find_index( - dict_index_t* index); /*!< Index to find. */ -/** Defragment an index. -@param pcur persistent cursor -@param thd current session, for checking thd_killed() -@return whether the operation was interrupted */ -bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd); -/******************************************************************//** -When table is dropped, this function is called to mark a table as removed in -btr_efragment_wq. The difference between this function and the remove_index -function is this will not NULL the event. */ -void -btr_defragment_remove_table( - dict_table_t* table); /*!< Index to be removed. */ -/*********************************************************************//** -Check whether we should save defragmentation statistics to persistent storage.*/ -void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index); - -/* Stop defragmentation.*/ -void btr_defragment_end(); -extern bool btr_defragment_active; -#endif diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index fc829e78..966247ff 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -69,7 +69,7 @@ enum btr_latch_mode { Used in btr_pcur_move_backward_from_page(). 
*/ BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF, /** Modify the previous record. - Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */ + Used in btr_pcur_move_backward_from_page(). */ BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF, /** Start modifying the entire B-tree. */ BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF, @@ -77,24 +77,8 @@ enum btr_latch_mode { Only used by rtr_search_to_nth_level(). */ BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE, - /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually - exclusive. */ - /** The search tuple will be inserted to the secondary index - at the searched position. When the leaf page is not in the - buffer pool, try to use the change buffer. */ - BTR_INSERT = 64, - - /** Try to delete mark a secondary index leaf page record at - the searched position using the change buffer when the page is - not in the buffer pool. */ - BTR_DELETE_MARK = 128, - - /** Try to purge the record using the change buffer when the - secondary index leaf page is not in the buffer pool. */ - BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK, - /** The caller is already holding dict_index_t::lock S-latch. */ - BTR_ALREADY_S_LATCHED = 256, + BTR_ALREADY_S_LATCHED = 16, /** Search and S-latch a leaf page, assuming that the dict_index_t::lock S-latch is being held. */ BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF @@ -111,28 +95,15 @@ enum btr_latch_mode { BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF | BTR_ALREADY_S_LATCHED, - /** Attempt to delete-mark a secondary index record. */ - BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK, - /** Attempt to delete-mark a secondary index record - while holding the dict_index_t::lock S-latch. */ - BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF - | BTR_ALREADY_S_LATCHED, - /** Attempt to purge a secondary index record. */ - BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE, - /** Attempt to purge a secondary index record - while holding the dict_index_t::lock S-latch. 
*/ - BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF - | BTR_ALREADY_S_LATCHED, - /** In the case of BTR_MODIFY_TREE, the caller specifies the intention to delete record only. It is used to optimize block->lock range.*/ - BTR_LATCH_FOR_DELETE = 512, + BTR_LATCH_FOR_DELETE = 32, /** In the case of BTR_MODIFY_TREE, the caller specifies the intention to insert record only. It is used to optimize block->lock range.*/ - BTR_LATCH_FOR_INSERT = 1024, + BTR_LATCH_FOR_INSERT = 64, /** Attempt to delete a record in the tree. */ BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, @@ -143,12 +114,8 @@ enum btr_latch_mode { /** Attempt to insert a record into the tree. */ BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT, - /** This flag ORed to BTR_INSERT says that we can ignore possible - UNIQUE definition on secondary indexes when we decide if we can use - the insert buffer to speed up inserts */ - BTR_IGNORE_SEC_UNIQUE = 2048, /** Rollback in spatial index */ - BTR_RTREE_UNDO_INS = 4096, + BTR_RTREE_UNDO_INS = 128, /** Try to delete mark a spatial index record */ - BTR_RTREE_DELETE_MARK = 8192 + BTR_RTREE_DELETE_MARK = 256 }; diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index c291615c..b30763fa 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -45,13 +45,10 @@ Created 11/5/1995 Heikki Tuuri /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ +#define BUF_GET_RECOVER 9 /*!< like BUF_GET, but in recv_sys.recover() */ #define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ #define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make the block young in the LRU list */ -#define BUF_GET_IF_IN_POOL_OR_WATCH 15 - /*!< Get the page only if it's in the - buffer pool, if not then set a watch - on the page. */ #define BUF_GET_POSSIBLY_FREED 16 /*!< Like BUF_GET, but do not mind if the file page has been freed. */ @@ -204,11 +201,9 @@ buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size); @param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, -BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +or BUF_PEEK_IF_IN_POOL @param[in,out] mtr mini-transaction @param[out] err DB_SUCCESS or error code -@param[in] allow_ibuf_merge Allow change buffer merge while -reading the pages from file. @return pointer to the block or NULL */ buf_block_t* buf_page_get_gen( @@ -218,40 +213,12 @@ buf_page_get_gen( buf_block_t* guess, ulint mode, mtr_t* mtr, - dberr_t* err = NULL, - bool allow_ibuf_merge = false) - MY_ATTRIBUTE((nonnull(6), warn_unused_result)); - -/** This is the low level function used to get access to a database page. 
-@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH -@param[in] guess guessed block or NULL -@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, -BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH -@param[in,out] mtr mini-transaction, or NULL if a - block with page_id is to be evicted -@param[out] err DB_SUCCESS or error code -@param[in] allow_ibuf_merge Allow change buffer merge to happen -while reading the page from file -then it makes sure that it does merging of change buffer changes while -reading the page from file. -@return pointer to the block or NULL */ -buf_block_t* -buf_page_get_low( - const page_id_t page_id, - ulint zip_size, - ulint rw_latch, - buf_block_t* guess, - ulint mode, - mtr_t* mtr, - dberr_t* err, - bool allow_ibuf_merge); + dberr_t* err = nullptr); /** Initialize a page in the buffer pool. The page is usually not read from a file even if it cannot be found in the buffer buf_pool. This is one of the functions which perform to a block a state transition NOT_USED => LRU -(the other is buf_page_get_low()). +(the other is buf_page_get_gen()). 
@param[in,out] space space object @param[in] offset offset of the tablespace @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -529,18 +496,16 @@ public: static constexpr uint32_t REMOVE_HASH= 2; /** smallest state() of a buffer page that is freed in the tablespace */ static constexpr uint32_t FREED= 3; + /* unused state: 1U<<29 */ /** smallest state() for a block that belongs to buf_pool.LRU */ - static constexpr uint32_t UNFIXED= 1U << 29; - /** smallest state() of a block for which buffered changes may exist */ - static constexpr uint32_t IBUF_EXIST= 2U << 29; + static constexpr uint32_t UNFIXED= 2U << 29; /** smallest state() of a (re)initialized page (no doublewrite needed) */ static constexpr uint32_t REINIT= 3U << 29; /** smallest state() for an io-fixed block */ static constexpr uint32_t READ_FIX= 4U << 29; + /* unused state: 5U<<29 */ /** smallest state() for a write-fixed block */ - static constexpr uint32_t WRITE_FIX= 5U << 29; - /** smallest state() for a write-fixed block with buffered changes */ - static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29; + static constexpr uint32_t WRITE_FIX= 6U << 29; /** smallest state() for a write-fixed block (no doublewrite was used) */ static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29; /** buf_pool.LRU status mask in state() */ @@ -552,8 +517,7 @@ public: byte *frame; /* @} */ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to) - is also protected by buf_pool.mutex; - !frame && !zip.data means an active buf_pool.watch */ + is also protected by buf_pool.mutex */ page_zip_des_t zip; #ifdef UNIV_DEBUG /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */ @@ -683,13 +647,6 @@ public: bool is_freed() const { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; } - bool is_ibuf_exist() const - { - const auto s= state(); - ut_ad(s >= UNFIXED); - ut_ad(s < READ_FIX); - return (s & LRU_MASK) == IBUF_EXIST; - } bool is_reinit() const { return !(~state() & 
REINIT); } void set_reinit(uint32_t prev_state) @@ -700,29 +657,10 @@ public: ut_ad(s < prev_state + UNFIXED); } - void set_ibuf_exist() - { - ut_ad(lock.is_write_locked()); - ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); - const auto s= state(); - ut_ad(s >= UNFIXED); - ut_ad(s < READ_FIX); - ut_ad(s < IBUF_EXIST || s >= REINIT); - zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s)); - } - void clear_ibuf_exist() - { - ut_ad(lock.is_write_locked()); - ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); - ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED); - ut_ad(s >= IBUF_EXIST); - ut_ad(s < REINIT); - } - uint32_t read_unfix(uint32_t s) { ut_ad(lock.is_write_locked()); - ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1); + ut_ad(s == UNFIXED + 1 || s == REINIT + 1); uint32_t old_state= zip.fix.fetch_add(s - READ_FIX); ut_ad(old_state >= READ_FIX); ut_ad(old_state < WRITE_FIX); @@ -812,7 +750,7 @@ public: uint32_t fix(uint32_t count= 1) { ut_ad(count); - ut_ad(count < IBUF_EXIST); + ut_ad(count < REINIT); uint32_t f= zip.fix.fetch_add(count); ut_ad(f >= FREED); ut_ad(!((f ^ (f + 1)) & LRU_MASK)); @@ -1417,78 +1355,10 @@ public: public: /** @return whether the buffer pool contains a page - @tparam allow_watch whether to allow watch_is_sentinel() @param page_id page identifier @param chain hash table chain for page_id.fold() */ - template<bool allow_watch= false> - TRANSACTIONAL_INLINE - bool page_hash_contains(const page_id_t page_id, hash_chain &chain) - { - transactional_shared_lock_guard<page_hash_latch> g - {page_hash.lock_get(chain)}; - buf_page_t *bpage= page_hash.get(page_id, chain); - if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)]) - { - ut_ad(!bpage->in_zip_hash); - ut_ad(!bpage->zip.data); - if (!allow_watch) - bpage= nullptr; - } - return bpage; - } - - /** Determine if a block is a sentinel for a buffer pool watch. 
- @param bpage page descriptor - @return whether bpage a sentinel for a buffer pool watch */ - bool watch_is_sentinel(const buf_page_t &bpage) - { -#ifdef SAFE_MUTEX - DBUG_ASSERT(mysql_mutex_is_owner(&mutex) || - page_hash.lock_get(page_hash.cell_get(bpage.id().fold())). - is_locked()); -#endif /* SAFE_MUTEX */ - ut_ad(bpage.in_file()); - if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)]) - return false; - ut_ad(!bpage.in_zip_hash); - ut_ad(!bpage.zip.data); - return true; - } - - /** Check if a watched page has been read. - This may only be called after !watch_set() and before invoking watch_unset(). - @param id page identifier - @return whether the page was read to the buffer pool */ - TRANSACTIONAL_INLINE - bool watch_occurred(const page_id_t id) - { - hash_chain &chain= page_hash.cell_get(id.fold()); - transactional_shared_lock_guard<page_hash_latch> g - {page_hash.lock_get(chain)}; - /* The page must exist because watch_set() increments buf_fix_count. */ - return !watch_is_sentinel(*page_hash.get(id, chain)); - } - - /** Register a watch for a page identifier. - @param id page identifier - @param chain page_hash.cell_get(id.fold()) - @return a buffer page corresponding to id - @retval nullptr if the block was not present in page_hash */ - buf_page_t *watch_set(const page_id_t id, hash_chain &chain); - - /** Stop watching whether a page has been read in. - watch_set(id) must have returned nullptr before. - @param id page identifier - @param chain unlocked hash table chain */ - void watch_unset(const page_id_t id, hash_chain &chain); - - /** Remove the sentinel block for the watch before replacing it with a - real block. watch_unset() or watch_occurred() will notice - that the block has been replaced with the real block. 
- @param w sentinel - @param chain locked hash table chain - @return w->state() */ - inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain); + TRANSACTIONAL_TARGET + bool page_hash_contains(const page_id_t page_id, hash_chain &chain); /** @return whether less than 1/4 of the buffer pool is available */ TPOOL_SUPPRESS_TSAN @@ -1883,9 +1753,6 @@ public: # error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" #endif - /** Sentinels to detect if pages are read into the buffer pool while - a delete-buffering operation is pending. Protected by mutex. */ - buf_page_t watch[innodb_purge_threads_MAX + 1]; /** Reserve a buffer. */ buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads) { return io_buf.reserve(wait_for_reads); } diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl index 050c8493..048e3d15 100644 --- a/storage/innobase/include/buf0buf.inl +++ b/storage/innobase/include/buf0buf.inl @@ -79,7 +79,7 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage) @return own: the allocated block, in state BUF_BLOCK_MEMORY */ inline buf_block_t *buf_block_alloc() { - return buf_LRU_get_free_block(false); + return buf_LRU_get_free_block(have_no_mutex); } /********************************************************************//** diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 6e7662d9..f912775d 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -53,9 +53,9 @@ class buf_dblwr_t element* buf_block_arr; }; - /** the page number of the first doublewrite block (block_size() pages) */ + /** the page number of the first doublewrite block (block_size pages) */ page_id_t block1{0, 0}; - /** the page number of the second doublewrite block (block_size() pages) */ + /** the page number of the second doublewrite block (block_size pages) */ page_id_t block2{0, 0}; /** mutex protecting the data members below */ @@ -74,6 +74,22 @@ class buf_dblwr_t slot slots[2]; 
slot *active_slot; + /** Size of the doublewrite block in pages */ + uint32_t block_size; + +public: + /** Values of use */ + enum usage { + /** Assume that writes are atomic */ + USE_NO= 0, + /** Use the doublewrite buffer with full durability */ + USE_YES, + /** Durable writes to the doublewrite buffer, not to data files */ + USE_FAST + }; + /** The value of innodb_doublewrite */ + ulong use; +private: /** Initialise the persistent storage of the doublewrite buffer. @param header doublewrite page header in the TRX_SYS page */ inline void init(const byte *header); @@ -126,9 +142,6 @@ public: @param request the completed batch write request */ void flush_buffered_writes_completed(const IORequest &request); - /** Size of the doublewrite block in pages */ - uint32_t block_size() const { return FSP_EXTENT_SIZE; } - /** Schedule a page write. If the doublewrite memory buffer is full, flush_buffered_writes() will be invoked to make space. @param request asynchronous write request @@ -139,6 +152,19 @@ public: bool is_created() const { return UNIV_LIKELY(block1 != page_id_t(0, 0)); } + /** @return whether the doublewrite buffer is in use */ + bool in_use() const { return is_created() && use; } + /** @return whether fsync() is needed on non-doublewrite pages */ + bool need_fsync() const { return use < USE_FAST; } + + void set_use(ulong use) + { + ut_ad(use <= USE_FAST); + mysql_mutex_lock(&mutex); + this->use= use; + mysql_mutex_unlock(&mutex); + } + /** @return whether a page identifier is part of the doublewrite buffer */ bool is_inside(const page_id_t id) const { @@ -147,8 +173,8 @@ public: ut_ad(block1 < block2); if (id < block1) return false; - const uint32_t size= block_size(); - return id < block1 + size || (id >= block2 && id < block2 + size); + return id < block1 + block_size || + (id >= block2 && id < block2 + block_size); } /** Wait for flush_buffered_writes() to be fully completed */ diff --git a/storage/innobase/include/buf0lru.h 
b/storage/innobase/include/buf0lru.h index 28410276..c52fc05c 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -62,6 +62,17 @@ bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED); @retval NULL if the free list is empty */ buf_block_t* buf_LRU_get_free_only(); +/** How to acquire a block */ +enum buf_LRU_get { + /** The caller is not holding buf_pool.mutex */ + have_no_mutex= 0, + /** The caller is holding buf_pool.mutex */ + have_mutex, + /** The caller is not holding buf_pool.mutex and is OK if a block + cannot be allocated. */ + have_no_mutex_soft +}; + /** Get a block from the buf_pool.free list. If the list is empty, blocks will be moved from the end of buf_pool.LRU to buf_pool.free. @@ -83,9 +94,10 @@ we put it to free list to be used. 
* scan whole LRU list * scan LRU list even if buf_pool.try_LRU_scan is not set -@param have_mutex whether buf_pool.mutex is already being held -@return the free control block, in state BUF_BLOCK_MEMORY */ -buf_block_t* buf_LRU_get_free_block(bool have_mutex) +@param get how to allocate the block +@return the free control block, in state BUF_BLOCK_MEMORY +@retval nullptr if get==have_no_mutex_soft and memory was not available */ +buf_block_t* buf_LRU_get_free_block(buf_LRU_get get) MY_ATTRIBUTE((malloc,warn_unused_result)); /** @return whether the unzip_LRU list should be used for evicting a victim @@ -127,6 +139,10 @@ buf_unzip_LRU_add_block( ibool old); /*!< in: TRUE if should be put to the end of the list, else put to the start */ +/** Evict the temporary tablespace pages above the given threshold +@param threshold pages above this page number are to be removed from the LRU list */ +void buf_LRU_truncate_temp(uint32_t threshold); + /** Update buf_pool.LRU_old_ratio. @param[in] old_pct Reserve this percentage of the buffer pool for "old" blocks diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index 3dd085dd..46d08243 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2021, MariaDB Corporation. +Copyright (c) 2015, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,8 +24,7 @@ The database buffer read Created 11/5/1995 Heikki Tuuri *******************************************************/ -#ifndef buf0rea_h -#define buf0rea_h +#pragma once #include "buf0buf.h" @@ -33,15 +32,17 @@ Created 11/5/1995 Heikki Tuuri buffer buf_pool if it is not already there. 
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. -@param page_id page id -@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@retval DB_SUCCESS if the page was read and is not corrupted +@param page_id page id +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param chain buf_pool.page_hash cell for page_id +@retval DB_SUCCESS if the page was read and is not corrupted, @retval DB_SUCCESS_LOCKED_REC if the page was not read -@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size, + buf_pool_t::hash_chain &chain); /** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets @@ -57,21 +58,14 @@ void buf_read_page_background(fil_space_t *space, const page_id_t page_id, /** Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any page, not even the one at the position (space, offset), if the read-ahead -mechanism is not activated. NOTE 1: the calling thread may own latches on +mechanism is not activated. NOTE: the calling thread may own latches on pages: to avoid deadlocks this function must be written such that it cannot -end up waiting for these latches! NOTE 2: the calling thread must want -access to the page given: this rule is set to prevent unintended read-aheads -performed by ibuf routines, a situation which could result in a deadlock if -the OS does not support asynchronous i/o. 
+end up waiting for these latches! @param[in] page_id page id of a page which the current thread wants to access @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] ibuf whether we are inside ibuf routine -@return number of page read requests issued; NOTE that if we read ibuf -pages, it may happen that the page at the given page number does not -get read even if we return a positive value! */ -ulint -buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf); +@return number of page read requests issued */ +ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size); /** Applies linear read-ahead if in the buf_pool the page is a border page of a linear read-ahead area and all the pages in the area have been accessed. @@ -92,29 +86,15 @@ only very improbably. NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this function must be written such that it cannot end up waiting for these latches! -NOTE 3: the calling thread must want access to the page given: this rule is -set to prevent unintended read-aheads performed by ibuf routines, a situation -which could result in a deadlock if the OS does not support asynchronous io. @param[in] page_id page id of a page which the current thread wants to access @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] ibuf whether if we are inside ibuf routine @return number of page read requests issued */ -ulint -buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); +ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size); /** Schedule a page for recovery. 
@param space tablespace @param page_id page identifier @param recs log records -@param init page initialization, or nullptr if the page needs to be read */ +@param init_lsn page initialization, or 0 if the page needs to be read */ void buf_read_recover(fil_space_t *space, const page_id_t page_id, - page_recv_t &recs, recv_init *init); - -/** @name Modes used in read-ahead @{ */ -/** read only pages belonging to the insert buffer tree */ -#define BUF_READ_IBUF_PAGES_ONLY 131 -/** read any page */ -#define BUF_READ_ANY_PAGE 132 -/* @} */ - -#endif + page_recv_t &recs, lsn_t init_lsn); diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h index 3d63ddb7..d4885186 100644 --- a/storage/innobase/include/data0type.h +++ b/storage/innobase/include/data0type.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -33,7 +33,6 @@ Created 1/16/1996 Heikki Tuuri /** @return whether a length is actually stored in a field */ #define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT) -extern ulint data_mysql_default_charset_coll; #define DATA_MYSQL_BINARY_CHARSET_COLL 63 /* SQL data type struct */ @@ -196,14 +195,6 @@ constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double)); /*-------------------------------------------*/ -/* This many bytes we need to store the type information affecting the -alphabetical order for a single field and decide the storage size of an -SQL null*/ -#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 -/* In the >= 4.1.x storage format we add 2 bytes more so that we can also -store the charset-collation number; one byte is left unused, though */ -#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 - /* Maximum multi-byte character length in bytes, plus 1 */ #define DATA_MBMAX 8 @@ -344,13 +335,11 @@ charset-collation code. DATA_BINARY_TYPE etc. 
@param[in] charset_coll character-set collation code @return precise type, including the charset-collation code */ -UNIV_INLINE -uint32_t -dtype_form_prtype(ulint old_prtype, ulint charset_coll) +inline uint32_t dtype_form_prtype(ulint old_prtype, ulint charset_coll) { - ut_ad(old_prtype < 256 * 256); - ut_ad(charset_coll <= MAX_CHAR_COLL_NUM); - return(uint32_t(old_prtype + (charset_coll << 16))); + ut_ad(old_prtype <= 0xffff); + ut_ad(charset_coll <= MAX_CHAR_COLL_NUM); + return uint32_t(old_prtype | (charset_coll << 16)); } /*********************************************************************//** @@ -439,40 +428,6 @@ dtype_get_sql_null_size( const dtype_t* type, /*!< in: type */ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ -/**********************************************************************//** -Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. */ -UNIV_INLINE -void -dtype_read_for_order_and_null_size( -/*===============================*/ - dtype_t* type, /*!< in: type struct */ - const byte* buf); /*!< in: buffer for the stored order info */ -/**********************************************************************//** -Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. This is the >= 4.1.x storage -format. */ -UNIV_INLINE -void -dtype_new_store_for_order_and_null_size( -/*====================================*/ - byte* buf, /*!< in: buffer for - DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE - bytes where we store the info */ - const dtype_t* type, /*!< in: type struct */ - ulint prefix_len);/*!< in: prefix length to - replace type->len, or 0 */ -/**********************************************************************//** -Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. This is the 4.1.x storage -format. 
*/ -UNIV_INLINE -void -dtype_new_read_for_order_and_null_size( -/*===================================*/ - dtype_t* type, /*!< in: type struct */ - const byte* buf); /*!< in: buffer for stored type order info */ - /*********************************************************************//** Validates a data type structure. @return TRUE if ok */ @@ -494,8 +449,6 @@ struct dict_col_t; If you add fields to this structure, be sure to initialize them everywhere. This structure is initialized in the following functions: dtype_set() -dtype_read_for_order_and_null_size() -dtype_new_read_for_order_and_null_size() sym_tab_add_null_lit() */ struct dtype_t{ diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl index 329cee5d..add6c211 100644 --- a/storage/innobase/include/data0type.inl +++ b/storage/innobase/include/data0type.inl @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -183,126 +183,6 @@ dtype_get_mbmaxlen( return type->mbmaxlen; } -/**********************************************************************//** -Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. This is the >= 4.1.x storage -format. 
*/ -UNIV_INLINE -void -dtype_new_store_for_order_and_null_size( -/*====================================*/ - byte* buf, /*!< in: buffer for - DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE - bytes where we store the info */ - const dtype_t* type, /*!< in: type struct */ - ulint prefix_len)/*!< in: prefix length to - replace type->len, or 0 */ -{ - compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - - ulint len; - - ut_ad(type); - ut_ad(type->mtype >= DATA_VARCHAR); - ut_ad(type->mtype <= DATA_MTYPE_MAX); - - buf[0] = (byte)(type->mtype & 0xFFUL); - - if (type->prtype & DATA_BINARY_TYPE) { - buf[0] |= 128; - } - - /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { - buf[0] |= 64; - } - */ - - buf[1] = (byte)(type->prtype & 0xFFUL); - - len = prefix_len ? prefix_len : type->len; - - mach_write_to_2(buf + 2, len & 0xFFFFUL); - - ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM); - mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); - - if (type->prtype & DATA_NOT_NULL) { - buf[4] |= 128; - } -} - -/**********************************************************************//** -Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. This is the < 4.1.x -storage format. 
*/ -UNIV_INLINE -void -dtype_read_for_order_and_null_size( -/*===============================*/ - dtype_t* type, /*!< in: type struct */ - const byte* buf) /*!< in: buffer for stored type order info */ -{ - compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); - type->mtype = buf[0] & 63; - type->prtype = buf[1]; - - if (buf[0] & 128) { - type->prtype |= DATA_BINARY_TYPE; - } - - type->len = mach_read_from_2(buf + 2); - - type->prtype = dtype_form_prtype(type->prtype, - data_mysql_default_charset_coll); - dtype_set_mblen(type); -} - -/**********************************************************************//** -Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. This is the >= 4.1.x -storage format. */ -UNIV_INLINE -void -dtype_new_read_for_order_and_null_size( -/*===================================*/ - dtype_t* type, /*!< in: type struct */ - const byte* buf) /*!< in: buffer for stored type order info */ -{ - compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - - type->mtype = buf[0] & 63; - type->prtype = buf[1]; - - if (buf[0] & 128) { - type->prtype |= DATA_BINARY_TYPE; - } - - if (buf[4] & 128) { - type->prtype |= DATA_NOT_NULL; - } - - type->len = mach_read_from_2(buf + 2); - - ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK; - - if (dtype_is_string_type(type->mtype)) { - ut_a(charset_coll <= MAX_CHAR_COLL_NUM); - - if (charset_coll == 0) { - /* This insert buffer record was inserted with MySQL - version < 4.1.2, and the charset-collation code was not - explicitly stored to dtype->prtype at that time. It - must be the default charset-collation of this MySQL - installation. 
*/ - - charset_coll = data_mysql_default_charset_coll; - } - - type->prtype = dtype_form_prtype(type->prtype, charset_coll); - } - dtype_set_mblen(type); -} - /***********************************************************************//** Returns the size of a fixed size data type, 0 if not a fixed size type. @return fixed size, or 0 */ diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h index a6528747..68400d20 100644 --- a/storage/innobase/include/dict0boot.h +++ b/storage/innobase/include/dict0boot.h @@ -44,39 +44,6 @@ dict_hdr_get_new_id( (not assigned if NULL) */ uint32_t* space_id); /*!< out: space id (not assigned if NULL) */ -/** Update dict_sys.row_id in the dictionary header file page. */ -void dict_hdr_flush_row_id(row_id_t id); -/** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */ -inline row_id_t dict_sys_t::get_new_row_id() -{ - row_id_t id= row_id.fetch_add(1); - if (!(id % ROW_ID_WRITE_MARGIN)) - dict_hdr_flush_row_id(id); - return id; -} - -/** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */ -inline void dict_sys_t::update_row_id(row_id_t id) -{ - row_id_t sys_id= row_id; - while (id >= sys_id) - { - if (!row_id.compare_exchange_strong(sys_id, id)) - continue; - if (!(id % ROW_ID_WRITE_MARGIN)) - dict_hdr_flush_row_id(id); - break; - } -} - -/**********************************************************************//** -Writes a row id to a record or other 6-byte stored form. */ -inline void dict_sys_write_row_id(byte *field, row_id_t row_id) -{ - static_assert(DATA_ROW_ID_LEN == 6, "compatibility"); - mach_write_to_6(field, row_id); -} - /*****************************************************************//** Initializes the data dictionary memory structures when the database is started. This function is also called when the data dictionary is created. 
@@ -116,7 +83,7 @@ inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; } /*-------------------------------------------------------------*/ /* Dictionary header offsets */ -#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */ +//#define DICT_HDR_ROW_ID 0 /* Was: latest assigned DB_ROW_ID */ #define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */ #define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */ #define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/ diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h deleted file mode 100644 index 679484ad..00000000 --- a/storage/innobase/include/dict0defrag_bg.h +++ /dev/null @@ -1,101 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2016, 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/dict0defrag_bg.h -Code used for background table and index -defragmentation - -Created 25/08/2016 Jan Lindström -*******************************************************/ - -#ifndef dict0defrag_bg_h -#define dict0defrag_bg_h - -#include "dict0types.h" - -/** Indices whose defrag stats need to be saved to persistent storage.*/ -struct defrag_pool_item_t { - table_id_t table_id; - index_id_t index_id; -}; - -/** Allocator type, used by std::vector */ -typedef ut_allocator<defrag_pool_item_t> - defrag_pool_allocator_t; - -/** The multitude of tables to be defragmented- an STL vector */ -typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t> - defrag_pool_t; - -/** Pool where we store information on which tables are to be processed -by background defragmentation. */ -extern defrag_pool_t defrag_pool; - -/*****************************************************************//** -Initialize the defrag pool, called once during thread initialization. */ -void -dict_defrag_pool_init(void); -/*========================*/ - -/*****************************************************************//** -Free the resources occupied by the defrag pool, called once during -thread de-initialization. */ -void -dict_defrag_pool_deinit(void); -/*==========================*/ - -/*****************************************************************//** -Add an index in a table to the defrag pool, which is processed by the -background stats gathering thread. Only the table id and index id are -added to the list, so the table can be closed after being enqueued and -it will be opened when needed. 
If the table or index does not exist later -(has been DROPped), then it will be removed from the pool and skipped. */ -void -dict_stats_defrag_pool_add( -/*=======================*/ - const dict_index_t* index); /*!< in: table to add */ - -/*****************************************************************//** -Delete a given index from the auto defrag pool. */ -void -dict_stats_defrag_pool_del( -/*=======================*/ - const dict_table_t* table, /*!<in: if given, remove - all entries for the table */ - const dict_index_t* index); /*!< in: index to remove */ - -/** -Get the first index that has been added for updating persistent defrag -stats and eventually save its stats. */ -void dict_defrag_process_entries_from_defrag_pool(THD *thd); - -/*********************************************************************//** -Save defragmentation result. -@return DB_SUCCESS or error code */ -dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/*********************************************************************//** -Save defragmentation stats for a given index. -@return DB_SUCCESS or error code */ -dberr_t -dict_stats_save_defrag_stats( -/*============================*/ - dict_index_t* index); /*!< in: index */ -#endif /* dict0defrag_bg_h */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 3baac658..47350f9c 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,7 +2,7 @@ Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -649,8 +649,6 @@ dict_table_get_next_index( #define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() #define dict_index_is_unique(index) (index)->is_unique() #define dict_index_is_spatial(index) (index)->is_spatial() -#define dict_index_is_ibuf(index) (index)->is_ibuf() -#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() #define dict_index_has_virtual(index) (index)->has_virtual() /** Get all the FTS indexes on a table. @@ -665,7 +663,7 @@ dict_table_get_all_fts_indexes( /********************************************************************//** Gets the number of user-defined non-virtual columns in a table in the dictionary cache. -@return number of user-defined (e.g., not ROW_ID) non-virtual +@return number of user-defined (e.g., not DB_ROW_ID) non-virtual columns of a table */ UNIV_INLINE unsigned @@ -1381,27 +1379,10 @@ private: std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID}; /** hash table of temporary table IDs */ hash_table_t temp_id_hash; - /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID - (FIXME: remove this, and move to dict_table_t) */ - Atomic_relaxed<row_id_t> row_id; - /** The synchronization interval of row_id */ - static constexpr size_t ROW_ID_WRITE_MARGIN= 256; public: /** Diagnostic message for exceeding the lock_wait() timeout */ static const char fatal_msg[]; - /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */ - inline row_id_t get_new_row_id(); - - /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */ - inline void update_row_id(row_id_t id); - - /** Recover the global DB_ROW_ID sequence on database startup */ - void recover_row_id(row_id_t id) - { - row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN; - } - /** @return a new temporary table ID */ table_id_t acquire_temporary_table_id() { diff --git 
a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl index 4cc3eae9..ead22a21 100644 --- a/storage/innobase/include/dict0dict.inl +++ b/storage/innobase/include/dict0dict.inl @@ -244,7 +244,7 @@ dict_table_get_next_index( /********************************************************************//** Gets the number of user-defined non-virtual columns in a table in the dictionary cache. -@return number of user-defined (e.g., not ROW_ID) non-virtual +@return number of user-defined (e.g., not DB_ROW_ID) non-virtual columns of a table */ UNIV_INLINE unsigned diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index 3143aafd..c774a792 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -43,8 +43,10 @@ typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t; /** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system. Open each data file if an encryption plugin has been loaded. -@param spaces set of tablespace files to open */ -void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces); +@param spaces set of tablespace files to open +@param upgrade whether we need to invoke ibuf_upgrade() */ +void dict_load_tablespaces(const std::set<uint32_t> *spaces= nullptr, + bool upgrade= false); /** Make sure the data_file_name is saved in dict_table_t if needed. 
@param[in,out] table Table object */ diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 0268a280..52bb4777 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -65,7 +65,6 @@ combination of types */ auto-generated clustered indexes, also DICT_UNIQUE will be set */ #define DICT_UNIQUE 2 /*!< unique index */ -#define DICT_IBUF 8 /*!< insert buffer tree */ #define DICT_CORRUPT 16 /*!< bit to store the corrupted flag in SYS_INDEXES.TYPE */ #define DICT_FTS 32 /* FTS index; can't be combined with the @@ -266,7 +265,8 @@ use its own tablespace instead of the system tablespace. */ #define DICT_TF2_USE_FILE_PER_TABLE 16U /** Set when we discard/detach the tablespace */ -#define DICT_TF2_DISCARDED 32U +constexpr unsigned DICT_TF2_POS_DISCARDED= 5; +constexpr unsigned DICT_TF2_DISCARDED= 1U << DICT_TF2_POS_DISCARDED; /** This bit is set if all aux table names (both common tables and index tables) of a FTS table are in HEX format. */ @@ -947,10 +947,6 @@ struct zip_pad_info_t { rounds */ }; -/** Number of samples of data size kept when page compression fails for -a certain index.*/ -#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 - /** "GEN_CLUST_INDEX" is the name reserved for InnoDB default system clustered index when there is no primary key. */ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX"; @@ -996,7 +992,7 @@ struct dict_index_t { # define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50 unsigned type:DICT_IT_BITS; /*!< index type (DICT_CLUSTERED, DICT_UNIQUE, - DICT_IBUF, DICT_CORRUPT) */ + DICT_CORRUPT) */ #define MAX_KEY_LENGTH_BITS 12 unsigned trx_id_offset:MAX_KEY_LENGTH_BITS; /*!< position of the trx id column @@ -1116,23 +1112,6 @@ struct dict_index_t { /*!< has persistent statistics error printed for this index ? */ /* @} */ - /** Statistics for defragmentation, these numbers are estimations and - could be very inaccurate at certain times, e.g. 
right after restart, - during defragmentation, etc. */ - /* @{ */ - ulint stat_defrag_modified_counter; - ulint stat_defrag_n_pages_freed; - /* number of pages freed by defragmentation. */ - ulint stat_defrag_n_page_split; - /* number of page splits since last full index - defragmentation. */ - ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; - /* data size when compression failure happened - the most recent 10 times. */ - ulint stat_defrag_sample_next_slot; - /* in which slot the next sample should be - saved. */ - /* @} */ private: /** R-tree split sequence number */ Atomic_relaxed<node_seq_t> rtr_ssn; @@ -1184,12 +1163,8 @@ public: /** @return whether instant ALTER TABLE is in effect */ inline bool is_instant() const; - /** @return whether the index is the primary key index - (not the clustered index of the change buffer) */ - bool is_primary() const - { - return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF)); - } + /** @return whether the index is the primary key index */ + bool is_primary() const { return is_clust(); } /** @return whether this is a generated clustered index */ bool is_gen_clust() const { return type == DICT_CLUSTERED; } @@ -1203,16 +1178,13 @@ public: /** @return whether this is a spatial index */ bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); } - /** @return whether this is the change buffer */ - bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); } - /** @return whether this index requires locking */ - bool has_locking() const { return !is_ibuf(); } + static constexpr bool has_locking() { return true; } /** @return whether this is a normal B-tree index (not the change buffer, not SPATIAL or FULLTEXT) */ bool is_btree() const { - return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL + return UNIV_LIKELY(!(type & (DICT_SPATIAL | DICT_FTS | DICT_CORRUPT))); } @@ -2126,8 +2098,9 @@ public: process of altering partitions */ unsigned skip_alter_undo:1; - /*!< whether this is in a 
single-table tablespace and the .ibd - file is missing or page decryption failed and page is corrupted */ + /** whether this is in a single-table tablespace and the .ibd file + is believed to be missing or page decryption failed and page is + corrupted */ unsigned file_unreadable:1; /** TRUE if the table object has been added to the dictionary cache. */ @@ -2355,6 +2328,8 @@ private: Atomic_relaxed<pthread_t> lock_mutex_owner{0}; #endif public: + /** The next DB_ROW_ID value */ + Atomic_counter<uint64_t> row_id{0}; /** Autoinc counter value to give to the next inserted row. */ uint64_t autoinc; @@ -2632,19 +2607,6 @@ dict_col_get_spatial_status( return(spatial_status); } -/** Clear defragmentation summary. */ -inline void dict_stats_empty_defrag_summary(dict_index_t* index) -{ - index->stat_defrag_n_pages_freed = 0; -} - -/** Clear defragmentation related index stats. */ -inline void dict_stats_empty_defrag_stats(dict_index_t* index) -{ - index->stat_defrag_modified_counter = 0; - index->stat_defrag_n_page_split = 0; -} - #include "dict0mem.inl" #endif /* dict0mem_h */ diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 3b006daf..720c8e00 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -218,30 +218,15 @@ dict_stats_save_index_stat( trx_t* trx) MY_ATTRIBUTE((nonnull(1, 3, 6, 7))); -/** Report an error if updating table statistics failed because -.ibd file is missing, table decryption failed or table is corrupted. 
-@param[in,out] table Table -@param[in] defragment true if statistics is for defragment -@retval DB_DECRYPTION_FAILED if decryption of the table failed -@retval DB_TABLESPACE_DELETED if .ibd file is missing -@retval DB_CORRUPTION if table is marked as corrupted */ -dberr_t -dict_stats_report_error(dict_table_t* table, bool defragment = false) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - #include "dict0stats.inl" #ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS void test_dict_stats_all(); #endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ -/** Write all zeros (or 1 where it makes sense) into a table -and its indexes'statistics members. The resulting stats -correspond to an empty table. -@param table table stats to be emptied -@param empty_defrag_stats empty the defrag stats */ -void -dict_stats_empty_table( - dict_table_t* table, - bool empty_defrag_stats); +/** Write all zeros (or 1 where it makes sense) into a table and its indexes' +statistics members. The resulting stats correspond to an empty table. +@param table table statistics to be emptied */ +void dict_stats_empty_table(dict_table_t *table); + #endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index ec50e8cd..f6169227 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -48,10 +48,6 @@ struct dict_add_v_col_t; #define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ #define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO -/* The ibuf table and indexes's ID are assigned as the number -DICT_IBUF_ID_MIN plus the space id */ -#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL - typedef ib_id_t table_id_t; typedef ib_id_t index_id_t; @@ -136,13 +132,6 @@ struct table_name_t inline bool is_temporary() const; }; -#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG -/** Dump the change buffer at startup */ -extern my_bool ibuf_dump; -/** Flag to control insert buffer debugging. */ -extern uint ibuf_debug; -#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ - /** Shift for spatial status */ #define SPATIAL_STATUS_SHIFT 12 diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index dfda1178..41b6c59f 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -48,38 +48,6 @@ struct named_spaces_tag_t; using space_list_t= ilist<fil_space_t, space_list_tag_t>; -// Forward declaration -extern my_bool srv_use_doublewrite_buf; - -/** Possible values of innodb_flush_method */ -enum srv_flush_t -{ - /** fsync, the default */ - SRV_FSYNC= 0, - /** open log files in O_DSYNC mode */ - SRV_O_DSYNC, - /** do not call os_file_flush() when writing data files, but do flush - after writing to log files */ - SRV_LITTLESYNC, - /** do not flush after writing */ - SRV_NOSYNC, - /** Open or create files with O_DIRECT. This implies using - unbuffered I/O but still fdatasync(), because some filesystems might - not flush meta-data on write completion */ - SRV_O_DIRECT, - /** Like O_DIRECT, but skip fdatasync(), assuming that the data is - durable on write completion */ - SRV_O_DIRECT_NO_FSYNC -#ifdef _WIN32 - /** Traditional Windows appoach to open all files without caching, - and do FileFlushBuffers() */ - ,SRV_ALL_O_DIRECT_FSYNC -#endif -}; - -/** innodb_flush_method */ -extern ulong srv_file_flush_method; - /** Undo tablespaces starts with space_id. */ extern uint32_t srv_undo_space_id_start; /** The number of UNDO tablespaces that are open and ready to use. */ @@ -645,6 +613,8 @@ private: } public: + /** Reopen all files on set_write_through() or set_buffered(). */ + static void reopen_all(); /** Try to close a file to adhere to the innodb_open_files limit. @param print_info whether to diagnose why a file cannot be closed @return whether a file was closed */ @@ -958,6 +928,11 @@ public: freed_ranges.add_range(range); } + /** Clear the freed range in temporary tablespace + which are in shrinking ranges. 
+ @param threshold to be truncated value*/ + inline void clear_freed_ranges(uint32_t threshold); + /** Set the tablespace size in pages */ void set_sizes(uint32_t s) { @@ -1035,6 +1010,9 @@ public: /** @return the tablespace name (databasename/tablename) */ name_type name() const; + /** Update the data structures on write completion */ + void complete_write(); + private: /** @return whether the file is usable for io() */ ATTRIBUTE_COLD bool prepare_acquired(); @@ -1107,9 +1085,6 @@ struct fil_node_t final @return detached handle or OS_FILE_CLOSED */ inline pfs_os_file_t close_to_free(bool detach_handle= false); - /** Update the data structures on write completion */ - inline void complete_write(); - private: /** Does stuff common for close() and detach() */ void prepare_to_close_or_detach(); @@ -1117,8 +1092,7 @@ private: inline bool fil_space_t::use_doublewrite() const { - return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf && - buf_dblwr.is_created(); + return !UT_LIST_GET_FIRST(chain)->atomic_write && buf_dblwr.in_use(); } inline void fil_space_t::set_imported() @@ -1285,11 +1259,11 @@ constexpr uint16_t FIL_PAGE_RTREE= 17854; constexpr uint16_t FIL_PAGE_UNDO_LOG= 2; /** Index node (of file-in-file metadata) */ constexpr uint16_t FIL_PAGE_INODE= 3; -/** Insert buffer free list */ +/** Former change buffer free list */ constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4; /** Freshly allocated page */ constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0; -/** Change buffer bitmap (pages n*innodb_page_size+1) */ +/** Former change buffer bitmap pages (pages n*innodb_page_size+1) */ constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5; /** System page */ constexpr uint16_t FIL_PAGE_TYPE_SYS= 6; @@ -1379,9 +1353,9 @@ struct fil_system_t Some members may require late initialisation, thus we just mark object as uninitialised. Real initialisation happens in create(). 
*/ - fil_system_t() : m_initialised(false) {} + fil_system_t() {} - bool is_initialised() const { return m_initialised; } + bool is_initialised() const { return spaces.array; } /** Create the file system interface at database start. @@ -1394,8 +1368,6 @@ struct fil_system_t void close(); private: - bool m_initialised; - /** Points to the last opened space in space_list. Protected with fil_system.mutex. */ fil_space_t *space_list_last_opened= nullptr; @@ -1430,6 +1402,33 @@ public: fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; + + /** false=invoke fsync() or fdatasync() on data files before checkpoint; + true=each write is durable (O_DSYNC) */ + my_bool write_through; + /** whether data files are buffered (not O_DIRECT) */ + my_bool buffered; + /** whether fdatasync() is needed on data files */ + Atomic_relaxed<bool> need_unflushed_spaces; + + /** Try to enable or disable write-through of data files */ + void set_write_through(bool write_through); + /** Update innodb_doublewrite */ + void set_use_doublewrite(ulong use) + { + buf_dblwr.set_use(use); + need_unflushed_spaces= !write_through && buf_dblwr.need_fsync(); + } + + /** Try to enable or disable file system caching of data files */ + void set_buffered(bool buffered); + + TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; } + TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; } + + /** @return whether to update unflushed_spaces */ + bool use_unflushed_spaces() const { return need_unflushed_spaces; } + /** tablespaces for which fil_space_t::needs_flush() holds */ sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces; /** number of currently open files; protected by mutex */ @@ -1555,12 +1554,7 @@ template<bool have_reference> inline void fil_space_t::flush() mysql_mutex_assert_not_owner(&fil_system.mutex); ut_ad(!have_reference || (pending() & PENDING)); ut_ad(purpose == 
FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT); - if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) - { - ut_ad(!is_in_unflushed_spaces); - ut_ad(!needs_flush()); - } - else if (have_reference) + if (have_reference) flush_low(); else { diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h index 67e79f1a..62dec39b 100644 --- a/storage/innobase/include/fsp0file.h +++ b/storage/innobase/include/fsp0file.h @@ -317,6 +317,8 @@ public: void set_space_id(uint32_t space_id) { m_space_id= space_id; } void set_flags(uint32_t flags) { m_flags = flags; } + + uint32_t param_size() const { return m_user_param_size; } private: /** Free the filepath buffer. */ void free_filepath(); @@ -401,6 +403,9 @@ private: pages in SysTablespace::normalize_size() */ uint32_t m_size; + /** Size in pages; Initial parameter size */ + uint32_t m_user_param_size; + /** ordinal position of this datafile in the tablespace */ ulint m_order; diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 99459bcb..015cb48c 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -555,6 +555,12 @@ inline void fsp_init_file_page( mtr->init(block); } +/** Truncate the system tablespace */ +void fsp_system_tablespace_truncate(); + +/** Truncate the temporary tablespace */ +void fsp_shrink_temp_space(); + #ifndef UNIV_DEBUG # define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr) #endif diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h index 514f3fdb..3ff0e864 100644 --- a/storage/innobase/include/fsp0sysspace.h +++ b/storage/innobase/include/fsp0sysspace.h @@ -119,6 +119,12 @@ public: return(m_auto_extend_last_file); } + /** @return auto shrink */ + bool can_auto_shrink() const + { + return m_auto_shrink; + } + /** Set the last file size. 
@param[in] size the size to set */ void set_last_file_size(uint32_t size) @@ -144,6 +150,16 @@ public: } /** + @return user specified tablespace size */ + uint32_t get_min_size() const + { + uint32_t full_size= 0; + for (uint32_t i= 0; i < m_files.size(); i++) + full_size+= m_files.at(i).m_user_param_size; + return full_size; + } + + /** @return next increment size */ uint32_t get_increment() const; @@ -251,6 +267,10 @@ private: /** if false, then sanity checks are still pending */ bool m_sanity_checks_done; + + /** Shrink the system tablespace if the value is + enabled */ + bool m_auto_shrink; }; /* GLOBAL OBJECTS */ diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 9a23e840..757ead55 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2022, MariaDB Corporation. +Copyright (c) 2014, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -157,28 +157,20 @@ this many file pages */ /* This has been replaced with either srv_page_size or page_zip->size. */ /** @name The space low address page map -The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated +The 2 pages at FSP_XDES_OFFSET are repeated every XDES_DESCRIBED_PER_PAGE pages in every tablespace. 
*/ /* @{ */ /*--------------------------------------*/ #define FSP_XDES_OFFSET 0U /* !< extent descriptor */ -#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */ - /* The ibuf bitmap pages are the ones whose - page number is the number above plus a - multiple of XDES_DESCRIBED_PER_PAGE */ - #define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */ /* The following pages exist in the system tablespace (space 0). */ -#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer +#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< former change buffer header page, in tablespace 0 */ -#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< former change buffer B-tree root page in tablespace 0 */ - /* The ibuf tree root page number in - tablespace 0; its fseg inode is on the page - number FSP_FIRST_INODE_PAGE_NO */ #define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction system header, in tablespace 0 */ diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index 1adec365..dc8806a5 100644 --- a/storage/innobase/include/fut0lst.h +++ b/storage/innobase/include/fut0lst.h @@ -152,6 +152,15 @@ inline fil_addr_t flst_get_prev_addr(const flst_node_t *node) return flst_read_addr(node + FLST_PREV); } +/** Write a file address. +@param[in] block file page +@param[in,out] faddr file address location +@param[in] page page number +@param[in] boffset byte offset +@param[in,out] mtr mini-transaction */ +void flst_write_addr(const buf_block_t &block, byte *faddr, + uint32_t page, uint16_t boffset, mtr_t *mtr); + # ifdef UNIV_DEBUG /** Validate a file-based list. 
*/ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr); diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h index b07261ce..724a764d 100644 --- a/storage/innobase/include/gis0rtree.h +++ b/storage/innobase/include/gis0rtree.h @@ -62,40 +62,45 @@ Created 2013/03/27 Jimmy Yang and Allen Lai /** Search for a spatial index leaf page record. @param cur cursor +@param thr query thread @param tuple search tuple @param latch_mode latching mode @param mtr mini-transaction @param mode search mode */ -dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple, +dberr_t rtr_search_leaf(btr_cur_t *cur, que_thr_t *thr, const dtuple_t *tuple, btr_latch_mode latch_mode, mtr_t *mtr, page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE) - MY_ATTRIBUTE((nonnull, warn_unused_result)); + MY_ATTRIBUTE((nonnull(1,3,5), warn_unused_result)); /** Search for inserting a spatial index leaf page record. @param cur cursor @param tuple search tuple @param latch_mode latching mode @param mtr mini-transaction */ -inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple, +inline dberr_t rtr_insert_leaf(btr_cur_t *cur, que_thr_t *thr, + const dtuple_t *tuple, btr_latch_mode latch_mode, mtr_t *mtr) { - return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT); + return rtr_search_leaf(cur, thr, tuple, latch_mode, mtr, + PAGE_CUR_RTREE_INSERT); } /** Search for a spatial index leaf page record. 
-@param pcur cursor +@param pcur cursor +@param thr query thread @param tuple search tuple @param mode search mode @param mtr mini-transaction */ -dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple, +dberr_t rtr_search_leaf(btr_pcur_t *pcur, que_thr_t *thr, + const dtuple_t *tuple, page_cur_mode_t mode, mtr_t *mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); -dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, - page_cur_mode_t mode, - btr_latch_mode latch_mode, - btr_cur_t *cur, mtr_t *mtr) - MY_ATTRIBUTE((nonnull, warn_unused_result)); +dberr_t rtr_search_to_nth_level(btr_cur_t *cur, que_thr_t *thr, + const dtuple_t *tuple, + btr_latch_mode latch_mode, mtr_t *mtr, + page_cur_mode_t mode, ulint level) + MY_ATTRIBUTE((nonnull(1,3,5), warn_unused_result)); /**********************************************************************//** Builds a Rtree node pointer out of a physical record and a page number. @@ -132,7 +137,29 @@ rtr_page_split_and_insert( const dtuple_t* tuple, /*!< in: tuple to insert */ ulint n_ext, /*!< in: number of externally stored columns */ mtr_t* mtr, /*!< in: mtr */ - dberr_t* err); /*!< out: error code */ + dberr_t* err, /*!< out: error code */ + que_thr_t* thr); /*!< in: query thread */ + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts the tuple. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
+@return inserted record */ +rec_t* +rtr_root_raise_and_insert( + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err, /*!< out: error code */ + que_thr_t* thr); /*!< in: query thread */ /**************************************************************//** Sets the child node mbr in a node pointer. */ @@ -243,8 +270,8 @@ rtr_create_rtr_info( bool init_matches, /*!< in: Whether to initiate the "matches" structure for collecting matched leaf records */ - btr_cur_t* cursor, /*!< in: tree search cursor */ - dict_index_t* index); /*!< in: index struct */ + que_thr_t* thr, /*!< in/out: query thread */ + btr_cur_t* cursor); /*!< in: tree search cursor */ /********************************************************************//** Update a btr_cur_t with rtr_info */ @@ -299,8 +326,10 @@ rtr_get_mbr_from_tuple( about parent nodes in search @param[in,out] cursor cursor on node pointer record, its page x-latched +@param[in,out] thr query thread @return whether the cursor was successfully positioned */ -bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor) +bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor, + que_thr_t *thr) MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); /************************************************************//** @@ -312,11 +341,12 @@ rtr_page_get_father_block( /*======================*/ rec_offs* offsets,/*!< in: work area for the return value */ mem_heap_t* heap, /*!< in: memory heap to use */ - mtr_t* mtr, /*!< in: mtr */ btr_cur_t* 
sea_cur,/*!< in: search cursor, contains information about parent nodes in search */ - btr_cur_t* cursor);/*!< out: cursor on node pointer record, + btr_cur_t* cursor, /*!< out: cursor on node pointer record, its page x-latched */ + que_thr_t* thr, /*!< in/out: query thread */ + mtr_t* mtr); /*!< in/out: mtr */ /**************************************************************//** Store the parent path cursor @return number of cursor stored */ @@ -337,6 +367,7 @@ bool rtr_search( const dtuple_t* tuple, /*!< in: tuple on which search done */ btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + que_thr_t* thr, /*!< in/out; query thread */ mtr_t* mtr) /*!< in: mtr */ MY_ATTRIBUTE((warn_unused_result)); diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl index 5101eeb6..460496d1 100644 --- a/storage/innobase/include/gis0rtree.inl +++ b/storage/innobase/include/gis0rtree.inl @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -240,6 +240,9 @@ rtr_info_reinit_in_cursor( bool need_prdt) /*!< in: Whether predicate lock is needed */ { + que_thr_t* thr = cursor->rtr_info->thr; + ut_ad(thr); rtr_clean_rtr_info(cursor->rtr_info, false); rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true); + cursor->rtr_info->thr = thr; } diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h index c246b2ef..d1ff331f 100644 --- a/storage/innobase/include/ibuf0ibuf.h +++ b/storage/innobase/include/ibuf0ibuf.h @@ -1,7 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2022, MariaDB Corporation. +Copyright (c) 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -17,420 +16,40 @@ this program; if not, write to the Free Software Foundation, Inc., *****************************************************************************/ -/**************************************************//** -@file include/ibuf0ibuf.h -Insert buffer - -Created 7/19/1997 Heikki Tuuri -*******************************************************/ - -#ifndef ibuf0ibuf_h -#define ibuf0ibuf_h - -#include "mtr0mtr.h" -#include "dict0mem.h" -#include "fsp0fsp.h" - -/** Default value for maximum on-disk size of change buffer in terms -of percentage of the buffer pool. */ -#define CHANGE_BUFFER_DEFAULT_SIZE (25) - -/* Possible operations buffered in the insert/whatever buffer. See -ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */ -typedef enum { - IBUF_OP_INSERT = 0, - IBUF_OP_DELETE_MARK = 1, - IBUF_OP_DELETE = 2, - - /* Number of different operation types. 
*/ - IBUF_OP_COUNT = 3 -} ibuf_op_t; - -/** Combinations of operations that can be buffered. -@see innodb_change_buffering_names */ -enum ibuf_use_t { - IBUF_USE_NONE = 0, - IBUF_USE_INSERT, /* insert */ - IBUF_USE_DELETE_MARK, /* delete */ - IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */ - IBUF_USE_DELETE, /* delete+purge */ - IBUF_USE_ALL /* insert+delete+purge */ -}; - -/** Operations that can currently be buffered. */ -extern ulong innodb_change_buffering; - -/** Insert buffer struct */ -struct ibuf_t{ - Atomic_relaxed<ulint> size; /*!< current size of the ibuf index - tree, in pages */ - Atomic_relaxed<ulint> max_size; /*!< recommended maximum size of the - ibuf index tree, in pages */ - ulint seg_size; /*!< allocated pages of the file - segment containing ibuf header and - tree */ - bool empty; /*!< Protected by the page - latch of the root page of the - insert buffer tree - (FSP_IBUF_TREE_ROOT_PAGE_NO). true - if and only if the insert - buffer tree is empty. */ - ulint free_list_len; /*!< length of the free list */ - ulint height; /*!< tree height */ - dict_index_t* index; /*!< insert buffer index */ - - /** number of pages merged */ - Atomic_counter<ulint> n_merges; - Atomic_counter<ulint> n_merged_ops[IBUF_OP_COUNT]; - /*!< number of operations of each type - merged to index pages */ - Atomic_counter<ulint> n_discarded_ops[IBUF_OP_COUNT]; - /*!< number of operations of each type - discarded without merging due to the - tablespace being deleted or the - index being dropped */ -}; - -/** The insert buffer control structure */ -extern ibuf_t ibuf; - -/* The purpose of the insert buffer is to reduce random disk access. -When we wish to insert a record into a non-unique secondary index and -the B-tree leaf page where the record belongs to is not in the buffer -pool, we insert the record into the insert buffer B-tree, indexed by -(space_id, page_no). 
When the page is eventually read into the buffer -pool, we look up the insert buffer B-tree for any modifications to the -page, and apply these upon the completion of the read operation. This -is called the insert buffer merge. */ - -/* The insert buffer merge must always succeed. To guarantee this, -the insert buffer subsystem keeps track of the free space in pages for -which it can buffer operations. Two bits per page in the insert -buffer bitmap indicate the available space in coarse increments. The -free bits in the insert buffer bitmap must never exceed the free space -on a page. It is safe to decrement or reset the bits in the bitmap in -a mini-transaction that is committed before the mini-transaction that -affects the free space. It is unsafe to increment the bits in a -separately committed mini-transaction, because in crash recovery, the -free bits could momentarily be set too high. */ - -/******************************************************************//** -Creates the insert buffer data structure at a database startup. -@return DB_SUCCESS or failure */ -dberr_t -ibuf_init_at_db_start(void); -/*=======================*/ -/*********************************************************************//** -Updates the max_size value for ibuf. */ -void -ibuf_max_size_update( -/*=================*/ - ulint new_val); /*!< in: new value in terms of - percentage of the buffer pool size */ -/*********************************************************************//** -Reads the biggest tablespace id from the high end of the insert buffer -tree and updates the counter in fil_system. */ -void -ibuf_update_max_tablespace_id(void); -/*===============================*/ -/***************************************************************//** -Starts an insert buffer mini-transaction. 
*/ -UNIV_INLINE -void -ibuf_mtr_start( -/*===========*/ - mtr_t* mtr) /*!< out: mini-transaction */ - MY_ATTRIBUTE((nonnull)); -/***************************************************************//** -Commits an insert buffer mini-transaction. */ -UNIV_INLINE -void -ibuf_mtr_commit( -/*============*/ - mtr_t* mtr) /*!< in/out: mini-transaction */ - MY_ATTRIBUTE((nonnull)); -/************************************************************************//** -Resets the free bits of the page in the ibuf bitmap. This is done in a -separate mini-transaction, hence this operation does not restrict -further work to only ibuf bitmap operations, which would result if the -latch to the bitmap page were kept. NOTE: The free bits in the insert -buffer bitmap must never exceed the free space on a page. It is safe -to decrement or reset the bits in the bitmap in a mini-transaction -that is committed before the mini-transaction that affects the free -space. */ -void -ibuf_reset_free_bits( -/*=================*/ - buf_block_t* block); /*!< in: index page; free bits are set to 0 - if the index is a non-clustered - non-unique, and page level is 0 */ -/************************************************************************//** -Updates the free bits of an uncompressed page in the ibuf bitmap if -there is not enough free on the page any more. This is done in a -separate mini-transaction, hence this operation does not restrict -further work to only ibuf bitmap operations, which would result if the -latch to the bitmap page were kept. NOTE: The free bits in the insert -buffer bitmap must never exceed the free space on a page. It is -unsafe to increment the bits in a separately committed -mini-transaction, because in crash recovery, the free bits could -momentarily be set too high. It is only safe to use this function for -decrementing the free bits. Should more free space become available, -we must not update the free bits here, because that would break crash -recovery. 
*/ -UNIV_INLINE -void -ibuf_update_free_bits_if_full( -/*==========================*/ - buf_block_t* block, /*!< in: index page to which we have added new - records; the free bits are updated if the - index is non-clustered and non-unique and - the page level is 0, and the page becomes - fuller */ - ulint max_ins_size,/*!< in: value of maximum insert size with - reorganize before the latest operation - performed to the page */ - ulint increase);/*!< in: upper limit for the additional space - used in the latest operation, if known, or - ULINT_UNDEFINED */ -/**********************************************************************//** -Updates the free bits for an uncompressed page to reflect the present -state. Does this in the mtr given, which means that the latching -order rules virtually prevent any further operations for this OS -thread until mtr is committed. NOTE: The free bits in the insert -buffer bitmap must never exceed the free space on a page. It is safe -to set the free bits in the same mini-transaction that updated the -page. */ -void -ibuf_update_free_bits_low( -/*======================*/ - const buf_block_t* block, /*!< in: index page */ - ulint max_ins_size, /*!< in: value of - maximum insert size - with reorganize before - the latest operation - performed to the page */ - mtr_t* mtr); /*!< in/out: mtr */ -/**********************************************************************//** -Updates the free bits for a compressed page to reflect the present -state. Does this in the mtr given, which means that the latching -order rules virtually prevent any further operations for this OS -thread until mtr is committed. NOTE: The free bits in the insert -buffer bitmap must never exceed the free space on a page. It is safe -to set the free bits in the same mini-transaction that updated the -page. 
*/ -void -ibuf_update_free_bits_zip( -/*======================*/ - buf_block_t* block, /*!< in/out: index page */ - mtr_t* mtr); /*!< in/out: mtr */ -/**********************************************************************//** -Updates the free bits for the two pages to reflect the present state. -Does this in the mtr given, which means that the latching order rules -virtually prevent any further operations until mtr is committed. -NOTE: The free bits in the insert buffer bitmap must never exceed the -free space on a page. It is safe to set the free bits in the same -mini-transaction that updated the pages. */ -void -ibuf_update_free_bits_for_two_pages_low( -/*====================================*/ - buf_block_t* block1, /*!< in: index page */ - buf_block_t* block2, /*!< in: index page */ - mtr_t* mtr); /*!< in: mtr */ -/**********************************************************************//** -A basic partial test if an insert to the insert buffer could be possible and -recommended. */ -UNIV_INLINE -ibool -ibuf_should_try( -/*============*/ - dict_index_t* index, /*!< in: index where to insert */ - ulint ignore_sec_unique); /*!< in: if != 0, we should - ignore UNIQUE constraint on - a secondary index when we - decide */ -/******************************************************************//** -Returns TRUE if the current OS thread is performing an insert buffer -routine. - -For instance, a read-ahead of non-ibuf pages is forbidden by threads -that are executing an insert buffer routine. -@return TRUE if inside an insert buffer routine */ -UNIV_INLINE -ibool -ibuf_inside( -/*========*/ - const mtr_t* mtr) /*!< in: mini-transaction */ - MY_ATTRIBUTE((warn_unused_result)); - -/** Checks if a page address is an ibuf bitmap page (level 3 page) address. 
-@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@return TRUE if a bitmap page */ -inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) -{ - ut_ad(ut_is_2pow(zip_size)); - ulint size = zip_size ? zip_size : srv_page_size; - return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; -} - -/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. -Must not be called when recv_no_ibuf_operations==true. -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] x_latch FALSE if relaxed check (avoid latching the -bitmap page) -@param[in,out] mtr mtr which will contain an x-latch to the -bitmap page if the page is not one of the fixed address ibuf pages, or NULL, -in which case a new transaction is created. -@return true if level 2 or level 3 page */ -bool -ibuf_page_low( - const page_id_t page_id, - ulint zip_size, -#ifdef UNIV_DEBUG - bool x_latch, -#endif /* UNIV_DEBUG */ - mtr_t* mtr) - MY_ATTRIBUTE((warn_unused_result)); - -#ifdef UNIV_DEBUG -/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. -Must not be called when recv_no_ibuf_operations==true. -@param[in] page_id tablespace/page identifier -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in,out] mtr mini-transaction or NULL -@return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, zip_size, mtr) \ - ibuf_page_low(page_id, zip_size, true, mtr) - -#else /* UNIV_DEBUG */ - -/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. -Must not be called when recv_no_ibuf_operations==true. 
-@param[in] page_id tablespace/page identifier -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in,out] mtr mini-transaction or NULL -@return TRUE if level 2 or level 3 page */ -# define ibuf_page(page_id, zip_size, mtr) \ - ibuf_page_low(page_id, zip_size, mtr) - -#endif /* UNIV_DEBUG */ -/***********************************************************************//** -Frees excess pages from the ibuf free list. This function is called when an OS -thread calls fsp services to allocate a new file segment, or a new page to a -file segment, and the thread did not own the fsp latch before this call. */ -void -ibuf_free_excess_pages(void); -/*========================*/ - -/** Buffer an operation in the change buffer, instead of applying it -directly to the file page, if this is possible. Does not do it if the index -is clustered or unique. -@param[in] op operation type -@param[in] entry index entry to insert -@param[in,out] index index where to insert -@param[in] page_id page id where to insert -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in,out] thr query thread -@return true if success */ -bool -ibuf_insert( - ibuf_op_t op, - const dtuple_t* entry, - dict_index_t* index, - const page_id_t page_id, - ulint zip_size, - que_thr_t* thr); - -/** Check whether buffered changes exist for a page. -@param[in] id page identifier -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@return whether buffered changes exist */ -bool ibuf_page_exists(const page_id_t id, ulint zip_size); - -/** When an index page is read from a disk to the buffer pool, this function -applies any buffered operations to the page and deletes the entries from the -insert buffer. If the page is not read, but created in the buffer pool, this -function deletes its buffered entries from the insert buffer; there can -exist entries for such a page if the page belonged to an index which -subsequently was dropped. 
-@param block X-latched page to try to apply changes to, or NULL to discard -@param page_id page identifier -@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@return error code */ -dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, - const page_id_t page_id, - ulint zip_size); - -/** Delete all change buffer entries for a tablespace, -in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead. -@param[in] space missing or to-be-discarded tablespace */ -void ibuf_delete_for_discarded_space(uint32_t space); - -/** Contract the change buffer by reading pages to the buffer pool. -@return a lower limit for the combined size in bytes of entries which -will be merged from ibuf trees to the pages read -@retval 0 if ibuf.empty */ -ulint ibuf_contract(); - -/** Contracts insert buffer trees by reading pages referring to space_id -to the buffer pool. -@returns number of pages merged.*/ -ulint -ibuf_merge_space( -/*=============*/ - ulint space); /*!< in: space id */ - -/******************************************************************//** -Looks if the insert buffer is empty. -@return true if empty */ -bool -ibuf_is_empty(void); -/*===============*/ -/******************************************************************//** -Prints info of ibuf. */ -void -ibuf_print( -/*=======*/ - FILE* file); /*!< in: file where to print */ -/******************************************************************** -Read the first two bytes from a record's fourth field (counter field in new -records; something else in older records). -@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ -ulint -ibuf_rec_get_counter( -/*=================*/ - const rec_t* rec); /*!< in: ibuf record */ -/******************************************************************//** -Closes insert buffer and frees the data structures. */ -void -ibuf_close(void); -/*============*/ - -/** Check the insert buffer bitmaps on IMPORT TABLESPACE. 
-@param[in] trx transaction -@param[in,out] space tablespace being imported -@return DB_SUCCESS or error code */ -dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/** Update free bits and buffered bits for bulk loaded page. -@param block secondary index leaf page -@param mtr mini-transaction -@param reset whether the page is full */ -void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset); - -#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO -#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO - -/* The ibuf header page currently contains only the file segment header -for the file segment from which the pages for the ibuf tree are allocated */ -#define IBUF_HEADER PAGE_DATA -#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */ - -/* The insert buffer tree itself is always located in space 0. */ -#define IBUF_SPACE_ID static_cast<ulint>(0) - -#include "ibuf0ibuf.inl" - -#endif +#include "db0err.h" + +/* The purpose of the change buffer was to reduce random disk access. +When we wished to +(1) insert a record into a non-unique secondary index, +(2) delete-mark a secondary index record, +(3) delete a secondary index record as part of purge (but not ROLLBACK), +and the B-tree leaf page where the record belongs to is not in the buffer +pool, we inserted a record into the change buffer B-tree, indexed by +the page identifier. When the page was eventually read into the buffer +pool, we looked up the change buffer B-tree for any modifications to the +page, applied these upon the completion of the read operation. This +was called the insert buffer merge. + +There was a hash index of the change buffer B-tree, implemented as the +"change buffer bitmap". Bits in these bitmap pages indicated how full +the page roughly was, and whether any records for the page identifier +exist in the change buffer. 
The "free" bits had to be updated as part of +operations that modified secondary index leaf pages. + +Because the change buffer has been removed, we will no longer update +any change buffer bitmap pages. Instead, on database startup, we will +check if an upgrade needs to be performed, and apply any buffered +changes if that is the case. Finally, the change buffer will be +transformed to a format that will not be recognized by earlier +versions of MariaDB Server, to prevent downgrades from causing +corruption (due to the removed updates of the bitmap pages) when the +change buffer might be enabled. */ + +/** Check if ibuf_upgrade() is needed as part of server startup. +@return error code +@retval DB_SUCCESS if no upgrade is needed +@retval DB_FAIL if the change buffer is not empty (need ibuf_upgrade()) */ +dberr_t ibuf_upgrade_needed(); + +/** Upgrade the change buffer after all redo log has been applied. */ +dberr_t ibuf_upgrade(); diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl deleted file mode 100644 index 003bf22a..00000000 --- a/storage/innobase/include/ibuf0ibuf.inl +++ /dev/null @@ -1,282 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/ibuf0ibuf.ic -Insert buffer - -Created 7/19/1997 Heikki Tuuri -*******************************************************/ - -#include "page0page.h" -#include "page0zip.h" -#include "fsp0types.h" -#include "buf0lru.h" - -/** An index page must contain at least srv_page_size / -IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to -buffer inserts to this page. If there is this much of free space, the -corresponding bits are set in the ibuf bitmap. */ -#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32 - -/***************************************************************//** -Starts an insert buffer mini-transaction. */ -UNIV_INLINE -void -ibuf_mtr_start( -/*===========*/ - mtr_t* mtr) /*!< out: mini-transaction */ -{ - mtr_start(mtr); - mtr->enter_ibuf(); - - if (high_level_read_only || srv_read_only_mode) { - mtr_set_log_mode(mtr, MTR_LOG_NO_REDO); - } - -} -/***************************************************************//** -Commits an insert buffer mini-transaction. */ -UNIV_INLINE -void -ibuf_mtr_commit( -/*============*/ - mtr_t* mtr) /*!< in/out: mini-transaction */ -{ - ut_ad(mtr->is_inside_ibuf()); - ut_d(mtr->exit_ibuf()); - - mtr_commit(mtr); -} - -/************************************************************************//** -Sets the free bit of the page in the ibuf bitmap. This is done in a separate -mini-transaction, hence this operation does not restrict further work to only -ibuf bitmap operations, which would result if the latch to the bitmap page -were kept. 
*/ -void -ibuf_set_free_bits_func( -/*====================*/ - buf_block_t* block, /*!< in: index page of a non-clustered index; - free bit is reset if page level is 0 */ -#ifdef UNIV_IBUF_DEBUG - ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum - value which the bits must have before - setting; this is for debugging */ -#endif /* UNIV_IBUF_DEBUG */ - ulint val); /*!< in: value to set: < 4 */ -#ifdef UNIV_IBUF_DEBUG -# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v) -#else /* UNIV_IBUF_DEBUG */ -# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v) -#endif /* UNIV_IBUF_DEBUG */ - -/**********************************************************************//** -A basic partial test if an insert to the insert buffer could be possible and -recommended. */ -UNIV_INLINE -ibool -ibuf_should_try( -/*============*/ - dict_index_t* index, /*!< in: index where to insert */ - ulint ignore_sec_unique) /*!< in: if != 0, we should - ignore UNIQUE constraint on - a secondary index when we - decide */ -{ - if (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL) || - !innodb_change_buffering || !ibuf.max_size) - return false; - if (!ignore_sec_unique && index->is_unique()) - return false; - if (index->table->quiesce != QUIESCE_NONE) - return false; - for (unsigned i= 0; i < index->n_fields; i++) - if (index->fields[i].descending) - return false; - return true; -} - -/******************************************************************//** -Returns TRUE if the current OS thread is performing an insert buffer -routine. - -For instance, a read-ahead of non-ibuf pages is forbidden by threads -that are executing an insert buffer routine. -@return TRUE if inside an insert buffer routine */ -UNIV_INLINE -ibool -ibuf_inside( -/*========*/ - const mtr_t* mtr) /*!< in: mini-transaction */ -{ - return(mtr->is_inside_ibuf()); -} - -/** Translates the free space on a page to a value in the ibuf bitmap. 
-@param[in] page_size page size in bytes -@param[in] max_ins_size maximum insert size after reorganize for -the page -@return value for ibuf bitmap bits */ -UNIV_INLINE -ulint -ibuf_index_page_calc_free_bits( - ulint page_size, - ulint max_ins_size) -{ - ulint n; - ut_ad(ut_is_2pow(page_size)); - ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); - - n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); - - if (n == 3) { - n = 2; - } - - if (n > 3) { - n = 3; - } - - return(n); -} - -/*********************************************************************//** -Translates the free space on a compressed page to a value in the ibuf bitmap. -@return value for ibuf bitmap bits */ -UNIV_INLINE -ulint -ibuf_index_page_calc_free_zip( -/*==========================*/ - const buf_block_t* block) /*!< in: buffer block */ -{ - ulint max_ins_size; - const page_zip_des_t* page_zip; - lint zip_max_ins; - - ut_ad(block->page.zip.data); - - /* Consider the maximum insert size on the uncompressed page - without reorganizing the page. We must not assume anything - about the compression ratio. If zip_max_ins > max_ins_size and - there is 1/4 garbage on the page, recompression after the - reorganize could fail, in theory. So, let us guarantee that - merging a buffered insert to a compressed page will always - succeed without reorganizing or recompressing the page, just - by using the page modification log. */ - max_ins_size = page_get_max_insert_size( - buf_block_get_frame(block), 1); - - page_zip = buf_block_get_page_zip(block); - zip_max_ins = page_zip_max_ins_size(page_zip, - FALSE/* not clustered */); - - if (zip_max_ins < 0) { - return(0); - } else if (max_ins_size > (ulint) zip_max_ins) { - max_ins_size = (ulint) zip_max_ins; - } - - return(ibuf_index_page_calc_free_bits(block->physical_size(), - max_ins_size)); -} - -/*********************************************************************//** -Translates the free space on a page to a value in the ibuf bitmap. 
-@return value for ibuf bitmap bits */ -UNIV_INLINE -ulint -ibuf_index_page_calc_free( -/*======================*/ - const buf_block_t* block) /*!< in: buffer block */ -{ - if (!block->page.zip.data) { - ulint max_ins_size; - - max_ins_size = page_get_max_insert_size_after_reorganize( - buf_block_get_frame(block), 1); - - return(ibuf_index_page_calc_free_bits( - block->physical_size(), max_ins_size)); - } else { - return(ibuf_index_page_calc_free_zip(block)); - } -} - -/************************************************************************//** -Updates the free bits of an uncompressed page in the ibuf bitmap if -there is not enough free on the page any more. This is done in a -separate mini-transaction, hence this operation does not restrict -further work to only ibuf bitmap operations, which would result if the -latch to the bitmap page were kept. NOTE: The free bits in the insert -buffer bitmap must never exceed the free space on a page. It is -unsafe to increment the bits in a separately committed -mini-transaction, because in crash recovery, the free bits could -momentarily be set too high. It is only safe to use this function for -decrementing the free bits. Should more free space become available, -we must not update the free bits here, because that would break crash -recovery. 
*/ -UNIV_INLINE -void -ibuf_update_free_bits_if_full( -/*==========================*/ - buf_block_t* block, /*!< in: index page to which we have added new - records; the free bits are updated if the - index is non-clustered and non-unique and - the page level is 0, and the page becomes - fuller */ - ulint max_ins_size,/*!< in: value of maximum insert size with - reorganize before the latest operation - performed to the page */ - ulint increase)/*!< in: upper limit for the additional space - used in the latest operation, if known, or - ULINT_UNDEFINED */ -{ - ulint before; - ulint after; - - ut_ad(buf_block_get_page_zip(block) == NULL); - - before = ibuf_index_page_calc_free_bits( - srv_page_size, max_ins_size); - - if (max_ins_size >= increase) { - compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); - after = ibuf_index_page_calc_free_bits( - srv_page_size, max_ins_size - increase); -#ifdef UNIV_IBUF_DEBUG - ut_a(after <= ibuf_index_page_calc_free(block)); -#endif - } else { - after = ibuf_index_page_calc_free(block); - } - - if (after == 0) { - /* We move the page to the front of the buffer pool LRU list: - the purpose of this is to prevent those pages to which we - cannot make inserts using the insert buffer from slipping - out of the buffer pool */ - - buf_page_make_young(&block->page); - } - - if (before > after) { - ibuf_set_free_bits(block, after, before); - } -} diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 08b9f4bc..cab44dd9 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -185,13 +185,6 @@ lock_update_split_left( void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred, const page_id_t right); -/** Update the locks when a page is split and merged to two pages, -in defragmentation. 
*/ -void lock_update_split_and_merge( - const buf_block_t* left_block, /*!< in: left page to which merged */ - const rec_t* orig_pred, /*!< in: original predecessor of - supremum on the left page before merge*/ - const buf_block_t* right_block);/*!< in: right page from which merged */ /*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. */ diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index 2500ac05..1b8c4b41 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -87,7 +87,7 @@ void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end); /** Encrypt or decrypt a temporary file block. @param[in] src block to encrypt or decrypt -@param[in] size size of the block +@param[in] size length of both src and dst in bytes @param[out] dst destination block @param[in] offs offset to block @param[in] encrypt true=encrypt; false=decrypt @@ -102,7 +102,7 @@ bool log_tmp_block_encrypt( /** Decrypt a temporary file block. 
@param[in] src block to decrypt -@param[in] size size of the block +@param[in] size length of both src and dst in bytes @param[out] dst destination block @param[in] offs offset to block @return whether the operation succeeded */ diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index cef0dcae..85d01f2f 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -291,6 +291,8 @@ public: bool log_maybe_unbuffered; # endif #endif + /** whether each write to ib_logfile0 is durable (O_DSYNC) */ + my_bool log_write_through; /** Fields involved in checkpoints @{ */ lsn_t log_capacity; /*!< capacity of the log; if @@ -407,6 +409,8 @@ public: /** Try to enable or disable file system caching (update log_buffered) */ void set_buffered(bool buffered); #endif + /** Try to enable or disable durable writes (update log_write_through) */ + void set_write_through(bool write_through); void close_file(); diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index a73b7279..9321a8b8 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -203,22 +203,9 @@ struct page_recv_t inline void will_not_read(); }; -/** A page initialization operation that was parsed from the redo log */ -struct recv_init -{ - /** log sequence number of the page initialization */ - lsn_t lsn; - /** Whether btr_page_create() avoided a read of the page. - At the end of the last recovery batch, mark_ibuf_exist() - will mark pages for which this flag is set. 
*/ - bool created; -}; - /** Recovery system data structure */ struct recv_sys_t { - using init= recv_init; - /** mutex protecting this as well as some of page_recv_t */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; private: @@ -273,7 +260,10 @@ private: lsn_t lsn; /** truncated size of the tablespace, or 0 if not truncated */ unsigned pages; - } truncated_undo_spaces[127]; + }; + + trunc truncated_undo_spaces[127]; + trunc truncated_sys_space; public: /** The contents of the doublewrite buffer */ @@ -299,23 +289,23 @@ public: pages_it= pages.end(); } + /** Allow to apply system tablespace truncate redo log only + if the size to be extended is lesser than current size. + @retval true To apply the truncate shrink redo log record + @retval false otherwise */ + bool check_sys_truncate(); + private: /** Attempt to initialize a page based on redo log records. @param p iterator @param mtr mini-transaction @param b pre-allocated buffer pool block - @param init page initialization + @param init_lsn page initialization @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr, - buf_block_t *b, init &init); - /** Attempt to initialize a page based on redo log records. - @param page_id page identifier - @return the recovered block - @retval nullptr if the page cannot be initialized based on log records - @retval -1 if the page cannot be recovered due to corruption */ - ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id); + buf_block_t *b, lsn_t init_lsn); /** All found log files (multiple ones are possible if we are upgrading from before MariaDB Server 10.5.1) */ @@ -460,15 +450,14 @@ public: /** @return whether log file corruption was found */ bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); } - /** Attempt to initialize a page based on redo log records. 
+ /** Read a page or recover it based on redo log records. @param page_id page identifier - @return the recovered block - @retval nullptr if the page cannot be initialized based on log records - @retval -1 if the page cannot be recovered due to corruption */ - buf_block_t *recover(const page_id_t page_id) - { - return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr; - } + @param mtr mini-transaction + @param err error code + @return the requested block + @retval nullptr if the page cannot be accessed due to corruption */ + ATTRIBUTE_COLD + buf_block_t *recover(const page_id_t page_id, mtr_t *mtr, dberr_t *err); /** Try to recover a tablespace that was not readable earlier @param p iterator @@ -484,16 +473,6 @@ public: /** The recovery system */ extern recv_sys_t recv_sys; -/** If the following is TRUE, the buffer pool file pages must be invalidated -after recovery and no ibuf operations are allowed; this will be set if -recv_sys.pages becomes too full, and log records must be merged -to file pages already before the recovery is finished: in this case no -ibuf operations are allowed, as they could modify the pages read in the -buffer pool before the pages have been recovered to the up-to-date state. - -TRUE means that recovery is running and no operations on the log files -are allowed yet: the variable name is misleading. */ -extern bool recv_no_ibuf_operations; /** TRUE when recv_init_crash_recovery() has been called. */ extern bool recv_needed_recovery; #ifdef UNIV_DEBUG diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index bfa66216..27811872 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -309,15 +309,6 @@ public: @retval 0 if the transaction only modified temporary tablespaces */ lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; } - /** Note that we are inside the change buffer code. 
*/ - void enter_ibuf() { m_inside_ibuf= true; } - - /** Note that we have exited from the change buffer code. */ - void exit_ibuf() { m_inside_ibuf= false; } - - /** @return true if we are inside the change buffer code */ - bool is_inside_ibuf() const { return m_inside_ibuf; } - /** Note that some pages have been freed */ void set_trim_pages() { m_trim_pages= true; } @@ -772,10 +763,6 @@ private: /** whether log_sys.latch is locked exclusively */ uint16_t m_latch_ex:1; - /** whether change buffer is latched; only needed in non-debug builds - to suppress some read-ahead operations, @see ibuf_inside() */ - uint16_t m_inside_ibuf:1; - /** whether the pages has been trimmed */ uint16_t m_trim_pages:1; diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 7eba359f..ce686475 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -197,10 +197,14 @@ public: WRITE_SYNC= 16, /** Asynchronous write */ WRITE_ASYNC= WRITE_SYNC | 1, + /** Asynchronous doublewritten page */ + WRITE_DBL= WRITE_ASYNC | 4, /** A doublewrite batch */ DBLWR_BATCH= WRITE_ASYNC | 8, /** Write data and punch hole for the rest */ PUNCH= WRITE_ASYNC | 16, + /** Write doublewritten data and punch hole for the rest */ + PUNCH_DBL= PUNCH | 4, /** Zero out a range of bytes in fil_space_t::io() */ PUNCH_RANGE= WRITE_SYNC | 32, }; @@ -216,6 +220,14 @@ public: bool is_read() const { return (type & READ_SYNC) != 0; } bool is_write() const { return (type & WRITE_SYNC) != 0; } bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } + bool is_doublewritten() const { return (type & 4) != 0; } + + /** Create a write request for the doublewrite buffer. 
*/ + IORequest doublewritten() const + { + ut_ad(type == WRITE_ASYNC || type == PUNCH); + return IORequest{bpage, slot, node, Type(type | 4)}; + } void write_complete(int io_error) const; void read_complete(int io_error) const; diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h index 28aa3056..279138ac 100644 --- a/storage/innobase/include/page0cur.h +++ b/storage/innobase/include/page0cur.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2022, MariaDB Corporation. +Copyright (c) 2018, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -117,11 +117,6 @@ succeed, i.e., enough space available, NULL otherwise. The cursor stays at the same logical position, but the physical position may change if it is pointing to a compressed page that was reorganized. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if this is a compressed leaf page in a secondary index. -This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_commit(). - @return pointer to record if succeed, NULL otherwise */ UNIV_INLINE rec_t* @@ -151,11 +146,6 @@ page_cur_insert_rec_low( Inserts a record next to page cursor on a compressed and uncompressed page. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if this is a compressed leaf page in a secondary index. -This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_commit(). 
- @return pointer to inserted record @return nullptr on failure */ rec_t* diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl index 7c4eafa2..a73c31a7 100644 --- a/storage/innobase/include/page0cur.inl +++ b/storage/innobase/include/page0cur.inl @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2022, MariaDB Corporation. +Copyright (c) 2015, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -155,11 +155,6 @@ succeed, i.e., enough space available, NULL otherwise. The cursor stays at the same logical position, but the physical position may change if it is pointing to a compressed page that was reorganized. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if this is a compressed leaf page in a secondary index. -This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_commit(). - @return pointer to record if succeed, NULL otherwise */ UNIV_INLINE rec_t* diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 2978656b..38373f6b 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -1,6 +1,6 @@ /***************************************************************************** Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -95,7 +95,7 @@ this byte can be garbage. 
*/ direction */ #define PAGE_N_RECS 16 /* number of user records on the page */ /** The largest DB_TRX_ID that may have modified a record on the page; -Defined only in secondary index leaf pages and in change buffer leaf pages. +Defined only in secondary index leaf pages. Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */ #define PAGE_MAX_TRX_ID 18 /** The AUTO_INCREMENT value (on persistent clustered index root pages). */ @@ -901,11 +901,6 @@ MY_ATTRIBUTE((nonnull, warn_unused_result)) Differs from page_copy_rec_list_end, because this function does not touch the lock table and max trx id on page or compress the page. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if new_block is a compressed leaf page in a secondary index. -This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_t::commit(). - @return error code */ dberr_t page_copy_rec_list_end_no_locks( @@ -920,11 +915,6 @@ Copies records from page to new_page, from the given record onward, including that record. Infimum and supremum records are not copied. The records are copied to the start of the record list on new_page. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if new_block is a compressed leaf page in a secondary index. -This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_t::commit(). - @return pointer to the original successor of the infimum record on new_block @retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ rec_t* @@ -942,11 +932,6 @@ Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. The records are copied to the end of the record list on new_page. -IMPORTANT: The caller will have to update IBUF_BITMAP_FREE -if new_block is a compressed leaf page in a secondary index. 
-This has to be done either within the same mini-transaction, -or by invoking ibuf_reset_free_bits() before mtr_commit(). - @return pointer to the original predecessor of the supremum record on new_block @retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ rec_t* diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index 43329906..501ef31a 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -2,7 +2,7 @@ Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -213,9 +213,9 @@ page_zip_max_ins_size( /**********************************************************************//** Determine if enough space is available in the modification log. -@return TRUE if page_zip_write_rec() will succeed */ +@return true if page_zip_write_rec() will succeed */ UNIV_INLINE -ibool +bool page_zip_available( /*===============*/ const page_zip_des_t* page_zip,/*!< in: compressed page */ @@ -323,10 +323,6 @@ Reorganize and compress a page. This is a low-level operation for compressed pages, to be used when page_zip_compress() fails. On success, redo log will be written. The function btr_page_reorganize() should be preferred whenever possible. -IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a -non-clustered index, the caller must update the insert buffer free -bits in the same mini-transaction in such a way that the modification -will be redo-logged. 
@return error code @retval DB_FAIL on overflow; the block_zip will be left intact */ dberr_t diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl index afc877c3..edcd4ab4 100644 --- a/storage/innobase/include/page0zip.inl +++ b/storage/innobase/include/page0zip.inl @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -269,7 +269,7 @@ page_zip_max_ins_size( Determine if enough space is available in the modification log. @return TRUE if enough space is available */ UNIV_INLINE -ibool +bool page_zip_available( /*===============*/ const page_zip_des_t* page_zip,/*!< in: compressed page */ diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl index 46c209cb..da7337a3 100644 --- a/storage/innobase/include/rem0rec.inl +++ b/storage/innobase/include/rem0rec.inl @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2022, MariaDB Corporation. +Copyright (c) 2017, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1095,9 +1095,7 @@ rec_get_converted_size( ut_ad(dtuple_check_typed(dtuple)); #ifdef UNIV_DEBUG - if (dict_index_is_ibuf(index)) { - ut_ad(dtuple->n_fields > 1); - } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) == REC_STATUS_NODE_PTR) { ut_ad(dtuple->n_fields - 1 == dict_index_get_n_unique_in_tree_nonleaf(index)); diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h index fd2651da..33e0da0f 100644 --- a/storage/innobase/include/row0import.h +++ b/storage/innobase/include/row0import.h @@ -33,6 +33,7 @@ Created 2012-02-08 by Sunny Bains struct trx_t; struct dict_table_t; struct row_prebuilt_t; +struct HA_CREATE_INFO; /*****************************************************************//** Imports a tablespace. The space id in the .ibd file must match the space id @@ -64,4 +65,13 @@ dberr_t row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset) MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Prepare the create info to create a new stub table for import. +@param thd Connection +@param name Table name, format: "db/table_name". +@param create_info The create info for creating a stub. +@return ER_ error code +@retval 0 on success */ +int prepare_create_stub_for_import(THD *thd, const char *name, + HA_CREATE_INFO& create_info); + #endif /* row0import_h */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h index 1daf4d4a..33ac8599 100644 --- a/storage/innobase/include/row0purge.h +++ b/storage/innobase/include/row0purge.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. 
+Copyright (c) 2017, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,39 +37,6 @@ Created 3/14/1997 Heikki Tuuri #include <unordered_map> class MDL_ticket; -/** Determines if it is possible to remove a secondary index entry. -Removal is possible if the secondary index entry does not refer to any -not delete marked version of a clustered index record where DB_TRX_ID -is newer than the purge view. - -NOTE: This function should only be called by the purge thread, only -while holding a latch on the leaf page of the secondary index entry -(or keeping the buffer pool watch on the page). It is possible that -this function first returns true and then false, if a user transaction -inserts a record that the secondary index entry would refer to. -However, in that case, the user transaction would also re-insert the -secondary index entry after purge has removed it and released the leaf -page latch. -@param[in,out] node row purge node -@param[in] index secondary index -@param[in] entry secondary index entry -@param[in,out] sec_pcur secondary index cursor or NULL - if it is called for purge buffering - operation. -@param[in,out] sec_mtr mini-transaction which holds - secondary index entry or NULL if it is - called for purge buffering operation. -@param[in] is_tree true=pessimistic purge, - false=optimistic (leaf-page only) -@return true if the secondary index record can be purged */ -bool -row_purge_poss_sec( - purge_node_t* node, - dict_index_t* index, - const dtuple_t* entry, - btr_pcur_t* sec_pcur=NULL, - mtr_t* sec_mtr=NULL, - bool is_tree=false); /*************************************************************** Does the purge operation. 
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index 7056c77f..85c18dde 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2016, 2022, MariaDB Corporation. +Copyright (c) 2016, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,7 +28,6 @@ Created 4/20/1996 Heikki Tuuri #define row0row_h #include "que0types.h" -#include "ibuf0ibuf.h" #include "trx0types.h" #include "mtr0mtr.h" #include "rem0types.h" @@ -344,23 +343,10 @@ row_parse_int( ulint mtype, bool unsigned_type); -/** Result of row_search_index_entry */ -enum row_search_result { - ROW_FOUND = 0, /*!< the record was found */ - ROW_NOT_FOUND, /*!< record not found */ - ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or - BTR_DELETE_MARK was specified, the - secondary index leaf page was not in - the buffer pool, and the operation was - enqueued in the insert/delete buffer */ - ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and - row_purge_poss_sec() failed */ -}; - /***************************************************************//** Searches an index record. -@return whether the record was found or buffered */ -enum row_search_result +@return whether the record was found */ +bool row_search_index_entry( /*===================*/ const dtuple_t* entry, /*!< in: index entry */ @@ -404,22 +390,17 @@ row_raw_format( in bytes */ MY_ATTRIBUTE((nonnull, warn_unused_result)); +#include "dict0mem.h" + /** Prepare to start a mini-transaction to modify an index. 
@param[in,out] mtr mini-transaction -@param[in,out] index possibly secondary index -@param[in] pessimistic whether this is a pessimistic operation */ -inline -void -row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) +@param[in,out] index possibly secondary index */ +inline void row_mtr_start(mtr_t* mtr, dict_index_t* index) { mtr->start(); switch (index->table->space_id) { - case IBUF_SPACE_ID: - if (pessimistic - && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { - ibuf_free_excess_pages(); - } + case 0: break; case SRV_TMP_SPACE_ID: mtr->set_log_mode(MTR_LOG_NO_REDO); diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 2ed26748..6b9a6f09 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -2,7 +2,7 @@ Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -226,12 +226,8 @@ enum monitor_id_t { MONITOR_MODULE_BUF_PAGE, MONITOR_INDEX_LEAF_PAGE_READ, MONITOR_INDEX_NON_LEAF_PAGE_READ, - MONITOR_INDEX_IBUF_LEAF_PAGE_READ, - MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, MONITOR_UNDO_LOG_PAGE_READ, MONITOR_INODE_PAGE_READ, - MONITOR_IBUF_FREELIST_PAGE_READ, - MONITOR_IBUF_BITMAP_PAGE_READ, MONITOR_SYSTEM_PAGE_READ, MONITOR_TRX_SYSTEM_PAGE_READ, MONITOR_FSP_HDR_PAGE_READ, @@ -242,12 +238,8 @@ enum monitor_id_t { MONITOR_OTHER_PAGE_READ, MONITOR_INDEX_LEAF_PAGE_WRITTEN, MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, - MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, - MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, MONITOR_UNDO_LOG_PAGE_WRITTEN, MONITOR_INODE_PAGE_WRITTEN, - MONITOR_IBUF_FREELIST_PAGE_WRITTEN, - MONITOR_IBUF_BITMAP_PAGE_WRITTEN, MONITOR_SYSTEM_PAGE_WRITTEN, MONITOR_TRX_SYSTEM_PAGE_WRITTEN, MONITOR_FSP_HDR_PAGE_WRITTEN, @@ -345,17 +337,6 @@ enum monitor_id_t { MONITOR_MODULE_FIL_SYSTEM, MONITOR_OVLD_N_FILE_OPENED, - /* InnoDB Change Buffer related counters */ - MONITOR_MODULE_IBUF_SYSTEM, - MONITOR_OVLD_IBUF_MERGE_INSERT, - MONITOR_OVLD_IBUF_MERGE_DELETE, - MONITOR_OVLD_IBUF_MERGE_PURGE, - MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT, - MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE, - MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE, - MONITOR_OVLD_IBUF_MERGES, - MONITOR_OVLD_IBUF_SIZE, - /* Counters for server operations */ MONITOR_MODULE_SERVER, MONITOR_MASTER_THREAD_SLEEP, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 5e6bfc33..df25983a 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -161,9 +161,9 @@ extern char* srv_data_home; recovery and open all tables in RO mode instead of RW mode. We don't sync the max trx id to disk either. 
*/ extern my_bool srv_read_only_mode; -/** Set if InnoDB operates in read-only mode or innodb-force-recovery -is greater than SRV_FORCE_NO_IBUF_MERGE. */ -extern my_bool high_level_read_only; +/** Set if innodb_read_only is set or innodb_force_recovery +is SRV_FORCE_NO_UNDO_LOG_SCAN or greater. */ +extern bool high_level_read_only; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ extern my_bool srv_file_per_table; @@ -253,18 +253,6 @@ extern ulong srv_read_ahead_threshold; extern uint srv_n_read_io_threads; extern uint srv_n_write_io_threads; -/* Defragmentation, Origianlly facebook default value is 100, but it's too high */ -#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 -extern my_bool srv_defragment; -extern uint srv_defragment_n_pages; -extern uint srv_defragment_stats_accuracy; -extern uint srv_defragment_fill_factor_n_recs; -extern double srv_defragment_fill_factor; -extern uint srv_defragment_frequency; -extern ulonglong srv_defragment_interval; - -extern uint srv_change_buffer_max_size; - /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; @@ -289,7 +277,7 @@ extern ulong srv_flushing_avg_loops; extern ulong srv_force_recovery; -/** innodb_fast_shutdown=1 skips purge and change buffer merge. +/** innodb_fast_shutdown=1 skips purge. innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint). innodb_fast_shutdown=3 is a clean shutdown that skips the rollback of active transaction (to be done on restart). 
*/ @@ -305,7 +293,6 @@ extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; extern my_bool srv_stats_sample_traditional; -extern my_bool srv_use_doublewrite_buf; extern ulong srv_checksum_algorithm; extern my_bool srv_force_primary_key; @@ -562,11 +549,6 @@ void srv_monitor_task(void*); void srv_master_callback(void*); -/** -Complete the shutdown tasks such as background DROP TABLE, -and optionally change buffer merge (on innodb_fast_shutdown=0). */ -void srv_shutdown(bool ibuf_merge); - } /* extern "C" */ #ifdef UNIV_DEBUG @@ -630,14 +612,6 @@ struct export_var_t{ /** Number of undo tablespace truncation operations */ ulong innodb_undo_truncations; - ulint innodb_defragment_compression_failures; /*!< Number of - defragment re-compression - failures */ - - ulint innodb_defragment_failures; /*!< Number of defragment - failures*/ - ulint innodb_defragment_count; /*!< Number of defragment - operations*/ /** Number of instant ALTER TABLE operations that affect columns */ ulong innodb_instant_alter_column; diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h index 2c0167ac..7a7f93b6 100644 --- a/storage/innobase/include/sux_lock.h +++ b/storage/innobase/include/sux_lock.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2020, 2022, MariaDB Corporation. +Copyright (c) 2020, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -149,7 +149,7 @@ private: #endif public: - /** In crash recovery or the change buffer, claim the ownership + /** In crash recovery, claim the ownership of the exclusive block lock to the current thread */ void claim_ownership() { set_new_owner(pthread_self()); } diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 3fa41fdf..e6e8eb6b 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -589,8 +589,7 @@ public: So we take more expensive approach: get trx through current_thd()->ha_data. Some threads don't have trx attached to THD, and at least server - initialisation thread, fts_optimize_thread, srv_master_thread, - dict_stats_thread, srv_monitor_thread, btr_defragment_thread don't even + initialisation thread doesn't even have THD at all. For such cases we allocate pins only for duration of search and free them immediately. diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 15255354..7457addb 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2015, 2022, MariaDB Corporation. +Copyright (c) 2015, 2023, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -796,6 +796,7 @@ public: /** normally set; "SET unique_checks=0, foreign_key_checks=0" enables bulk insert into an empty table */ unsigned check_unique_secondary:1; + /** whether an insert into an empty table is active */ unsigned bulk_insert:1; /*------------------------------*/ diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 3d22a33e..2954cf73 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -451,10 +451,10 @@ completely purged and trx_purge_free_segment() has started freeing it */ /** Transaction end identifier (if the log is in a history list), or 0 if the transaction has not been committed */ #define TRX_UNDO_TRX_NO 8 -/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of +/* Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of surviving user records, this used to be called TRX_UNDO_DEL_MARKS. -This field is redundant; it is only being read by some debug assertions. +This field was removed in MariaDB 11.0. The value 1 indicates that purge needs to process the undo log segment. The value 0 indicates that all of it has been processed, and @@ -463,7 +463,7 @@ trx_purge_free_segment() has been invoked, so the log is not safe to access. Before MariaDB 10.3.1, a log segment may carry the value 0 even before trx_purge_free_segment() was called, for those undo log records for which purge would not result in removing delete-marked records. 
*/ -#define TRX_UNDO_NEEDS_PURGE 16 +/*#define TRX_UNDO_NEEDS_PURGE 16*/ #define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record of this log on the header page; purge may remove undo log record from the diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 1b4f70b6..8ef01bc0 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2013, 2022, MariaDB Corporation. +Copyright (c) 2013, 2023, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -135,7 +135,6 @@ using the call command. */ assertions. */ #define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ #define UNIV_HASH_DEBUG /* debug HASH_ macros */ -#define UNIV_IBUF_DEBUG /* debug the insert buffer */ #define UNIV_PERF_DEBUG /* debug flag that enables light weight performance related stuff. */ @@ -468,9 +467,6 @@ extern mysql_pfs_key_t fts_cache_mutex_key; extern mysql_pfs_key_t fts_cache_init_mutex_key; extern mysql_pfs_key_t fts_delete_mutex_key; extern mysql_pfs_key_t fts_doc_id_mutex_key; -extern mysql_pfs_key_t ibuf_bitmap_mutex_key; -extern mysql_pfs_key_t ibuf_mutex_key; -extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; extern mysql_pfs_key_t recalc_pool_mutex_key; extern mysql_pfs_key_t purge_sys_pq_mutex_key; extern mysql_pfs_key_t recv_sys_mutex_key; |