diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/buf | |
parent | Initial commit. (diff) | |
download | mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/buf')
-rw-r--r-- | storage/innobase/buf/buf0block_hint.cc | 59 | ||||
-rw-r--r-- | storage/innobase/buf/buf0buddy.cc | 769 | ||||
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 4180 | ||||
-rw-r--r-- | storage/innobase/buf/buf0checksum.cc | 98 | ||||
-rw-r--r-- | storage/innobase/buf/buf0dblwr.cc | 779 | ||||
-rw-r--r-- | storage/innobase/buf/buf0dump.cc | 765 | ||||
-rw-r--r-- | storage/innobase/buf/buf0flu.cc | 2765 | ||||
-rw-r--r-- | storage/innobase/buf/buf0lru.cc | 1452 | ||||
-rw-r--r-- | storage/innobase/buf/buf0rea.cc | 710 |
9 files changed, 11577 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc new file mode 100644 index 00000000..6bd01faa --- /dev/null +++ b/storage/innobase/buf/buf0block_hint.cc @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#include "buf0block_hint.h" +namespace buf { + +TRANSACTIONAL_TARGET +void Block_hint::buffer_fix_block_if_still_valid() +{ + /* To check if m_block belongs to the current buf_pool, we must + prevent freeing memory while we check, and until we buffer-fix the + block. For this purpose it is enough to latch any of the many + latches taken by buf_pool_t::resize(). 
+ + Similar to buf_page_optimistic_get(), we must validate + m_block->page.id() after acquiring the hash_lock, because the object + may have been freed and not actually attached to buf_pool.page_hash + at the moment. (The block could have been reused to store a + different page, and that slice of buf_pool.page_hash could be protected + by another hash_lock that we are not holding.) + + Finally, we must ensure that the block is not being freed. */ + if (m_block) + { + auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold()); + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(cell)}; + if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() && + m_block->page.frame && m_block->page.in_file()) + m_block->page.fix(); + else + clear(); + } +} +} // namespace buf diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc new file mode 100644 index 00000000..85a698bc --- /dev/null +++ b/storage/innobase/buf/buf0buddy.cc @@ -0,0 +1,769 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buddy.cc +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "page0zip.h" +#include "srv0start.h" + +/** When freeing a buf we attempt to coalesce by looking at its buddy +and deciding whether it is free or not. To ascertain if the buddy is +free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET +within the buddy. The question is how we can be sure that it is +safe to look at BUF_BUDDY_STAMP_OFFSET. +The answer lies in following invariants: +* All blocks allocated by buddy allocator are used for compressed +page frame. +* A compressed table always have space_id < SRV_SPACE_ID_UPPER_BOUND +* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in +a frame. + -- The above is true because we look at these fields when the + corresponding buddy block is free which implies that: + * The block we are looking at must have an address aligned at + the same size that its free buddy has. For example, if we have + a free block of 8K then its buddy's address must be aligned at + 8K as well. + * It is possible that the block we are looking at may have been + further divided into smaller sized blocks but its starting + address must still remain the start of a page frame i.e.: it + cannot be middle of a block. For example, if we have a free + block of size 8K then its buddy may be divided into blocks + of, say, 1K, 1K, 2K, 4K but the buddy's address will still be + the starting address of first 1K compressed page. 
+ * What is important to note is that for any given block, the + buddy's address cannot be in the middle of a larger block i.e.: + in above example, our 8K block cannot have a buddy whose address + is aligned on 8K but it is part of a larger 16K block. +*/ + +/** Offset within buf_buddy_free_t where free or non_free stamps +are written.*/ +#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +/** Value that we stamp on all buffers that are currently on the zip_free +list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */ +#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND + +/** Stamp value for non-free buffers. Will be overwritten by a non-zero +value by the consumer of the block */ +#define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL + +/** Return type of buf_buddy_is_free() */ +enum buf_buddy_state_t { + BUF_BUDDY_STATE_FREE, /*!< If the buddy to completely free */ + BUF_BUDDY_STATE_USED, /*!< Buddy currently in used */ + BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy + are in use */ +}; + +/**********************************************************************//** +Invalidate memory area that we won't access while page is free */ +UNIV_INLINE +void +buf_buddy_mem_invalid( +/*==================*/ + buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of zip_free[] */ +{ + ut_ad(i <= BUF_BUDDY_SIZES); + + MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i); + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i); +} + +/**********************************************************************//** +Check if a buddy is stamped free. 
+@return whether the buddy is free */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +bool +buf_buddy_stamp_is_free( +/*====================*/ + const buf_buddy_free_t* buf) /*!< in: block to check */ +{ + compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE); + return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET) + == BUF_BUDDY_STAMP_FREE); +} + +/**********************************************************************//** +Stamps a buddy free. */ +UNIV_INLINE +void +buf_buddy_stamp_free( +/*=================*/ + buf_buddy_free_t* buf, /*!< in/out: block to stamp */ + ulint i) /*!< in: block size */ +{ + ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i)); + buf_buddy_mem_invalid(buf, i); + mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, + BUF_BUDDY_STAMP_FREE); + buf->stamp.size = i; +} + +/**********************************************************************//** +Stamps a buddy nonfree. +@param[in,out] buf block to stamp +@param[in] i block size */ +static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i) +{ + buf_buddy_mem_invalid(buf, i); + compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU); + memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); +} + +/**********************************************************************//** +Get the offset of the buddy of a compressed page frame. +@return the buddy relative of page */ +UNIV_INLINE +void* +buf_buddy_get( +/*==========*/ + byte* page, /*!< in: compressed page */ + ulint size) /*!< in: page size in bytes */ +{ + ut_ad(ut_is_2pow(size)); + ut_ad(size >= BUF_BUDDY_LOW); + ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN); + ut_ad(size < BUF_BUDDY_HIGH); + ut_ad(BUF_BUDDY_HIGH == srv_page_size); + ut_ad(!ut_align_offset(page, size)); + + if (((ulint) page) & size) { + return(page - size); + } else { + return(page + size); + } +} + +#ifdef UNIV_DEBUG +/** Validate a given zip_free list. 
*/ +struct CheckZipFree { + CheckZipFree(ulint i) : m_i(i) {} + + void operator()(const buf_buddy_free_t* elem) const + { + ut_ad(buf_buddy_stamp_is_free(elem)); + ut_ad(elem->stamp.size <= m_i); + } + + const ulint m_i; +}; + +/** Validate a buddy list. +@param[in] i buddy size to validate */ +static void buf_buddy_list_validate(ulint i) +{ + ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i)); +} + +/**********************************************************************//** +Debug function to validate that a buffer is indeed free i.e.: in the +zip_free[]. +@param[in] buf block to check +@param[in] i index of buf_pool.zip_free[] +@return true if free */ +static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i) +{ + const ulint size = BUF_BUDDY_LOW << i; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + buf_buddy_free_t* itr; + + for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + itr && itr != buf; + itr = UT_LIST_GET_NEXT(list, itr)) { + } + + return(itr == buf); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Checks if a buf is free i.e.: in the zip_free[]. +@retval BUF_BUDDY_STATE_FREE if fully free +@retval BUF_BUDDY_STATE_USED if currently in use +@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */ +static MY_ATTRIBUTE((warn_unused_result)) +buf_buddy_state_t +buf_buddy_is_free( +/*==============*/ + buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of + buf_pool.zip_free[] */ +{ +#ifdef UNIV_DEBUG + const ulint size = BUF_BUDDY_LOW << i; + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); +#endif /* UNIV_DEBUG */ + + /* We assume that all memory from buf_buddy_alloc() + is used for compressed page frames. 
*/ + + /* We look inside the allocated objects returned by + buf_buddy_alloc() and assume that each block is a compressed + page that contains one of the following in space_id. + * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or + * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but + not initialized yet or + * A valid space_id of a compressed tablespace + + The call below attempts to read from free memory. The memory + is "owned" by the buddy allocator (and it has been allocated + from the buffer pool), so there is nothing wrong about this. */ + if (!buf_buddy_stamp_is_free(buf)) { + return(BUF_BUDDY_STATE_USED); + } + + /* A block may be free but a fragment of it may still be in use. + To guard against that we write the free block size in terms of + zip_free index at start of stamped block. Note that we can + safely rely on this value only if the buf is free. */ + ut_ad(buf->stamp.size <= i); + return(buf->stamp.size == i + ? BUF_BUDDY_STATE_FREE + : BUF_BUDDY_STATE_PARTIALLY_USED); +} + +/** Add a block to the head of the appropriate buddy free list. +@param[in,out] buf block to be freed +@param[in] i index of buf_pool.zip_free[] */ +UNIV_INLINE +void +buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.zip_free[i].start != buf); + + buf_buddy_stamp_free(buf, i); + UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf); + ut_d(buf_buddy_list_validate(i)); +} + +/** Remove a block from the appropriate buddy free list. +@param[in,out] buf block to be freed +@param[in] i index of buf_pool.zip_free[] */ +UNIV_INLINE +void +buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_buddy_check_free(buf, i)); + + UT_LIST_REMOVE(buf_pool.zip_free[i], buf); + buf_buddy_stamp_nonfree(buf, i); +} + +/** Try to allocate a block from buf_pool.zip_free[]. 
+@param[in] i index of buf_pool.zip_free[] +@return allocated block, or NULL if buf_pool.zip_free[] was empty */ +static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i) +{ + buf_buddy_free_t* buf; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + ut_d(buf_buddy_list_validate(i)); + + buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + if (buf_pool.is_shrinking() + && UT_LIST_GET_LEN(buf_pool.withdraw) + < buf_pool.withdraw_target) { + + while (buf != NULL + && buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(buf))) { + /* This should be withdrawn, not to be allocated */ + buf = UT_LIST_GET_NEXT(list, buf); + } + } + + if (buf) { + buf_buddy_remove_from_free(buf, i); + } else if (i + 1 < BUF_BUDDY_SIZES) { + /* Attempt to split. */ + buf = buf_buddy_alloc_zip(i + 1); + + if (buf) { + buf_buddy_free_t* buddy = + reinterpret_cast<buf_buddy_free_t*>( + reinterpret_cast<byte*>(buf) + + (BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool.contains_zip(buddy)); + buf_buddy_add_to_free(buddy, i); + } + } + + if (buf) { + /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */ + MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET); + MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes, + (BUF_BUDDY_LOW << i) + - (BUF_BUDDY_STAMP_OFFSET + 4)); + ut_ad(mach_read_from_4(buf->stamp.bytes + + BUF_BUDDY_STAMP_OFFSET) + == BUF_BUDDY_STAMP_NONFREE); + } + + return(buf); +} + +/** Deallocate a buffer frame of srv_page_size. 
+@param[in] buf buffer frame to deallocate */ +static +void +buf_buddy_block_free(void* buf) +{ + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); + buf_page_t* bpage; + buf_block_t* block; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(!ut_align_offset(buf, srv_page_size)); + + HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->state() == buf_page_t::MEMORY + && bpage->in_zip_hash), + bpage->frame == buf); + ut_a(bpage); + ut_a(bpage->state() == buf_page_t::MEMORY); + ut_ad(bpage->in_zip_hash); + ut_d(bpage->in_zip_hash = false); + HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage); + bpage->hash = nullptr; + + ut_d(memset(buf, 0, srv_page_size)); + MEM_UNDEFINED(buf, srv_page_size); + + block = (buf_block_t*) bpage; + buf_LRU_block_free_non_file_page(block); + + ut_ad(buf_pool.buddy_n_frames > 0); + ut_d(buf_pool.buddy_n_frames--); +} + +/**********************************************************************//** +Allocate a buffer block to the buddy allocator. */ +static +void +buf_buddy_block_register( +/*=====================*/ + buf_block_t* block) /*!< in: buffer frame to allocate */ +{ + const ulint fold = BUF_POOL_ZIP_FOLD(block); + ut_ad(block->page.state() == buf_page_t::MEMORY); + + ut_a(block->page.frame); + ut_a(!ut_align_offset(block->page.frame, srv_page_size)); + + ut_ad(!block->page.in_zip_hash); + ut_d(block->page.in_zip_hash = true); + HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page); + + ut_d(buf_pool.buddy_n_frames++); +} + +/** Allocate a block from a bigger object. 
+@param[in] buf a block that is free to use +@param[in] i index of buf_pool.zip_free[] +@param[in] j size of buf as an index of buf_pool.zip_free[] +@return allocated block */ +static +void* +buf_buddy_alloc_from(void* buf, ulint i, ulint j) +{ + ulint offs = BUF_BUDDY_LOW << j; + ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + ut_ad(j >= i); + ut_ad(!ut_align_offset(buf, offs)); + + /* Add the unused parts of the block to the free lists. */ + while (j > i) { + buf_buddy_free_t* zip_buf; + + offs >>= 1; + j--; + + zip_buf = reinterpret_cast<buf_buddy_free_t*>( + reinterpret_cast<byte*>(buf) + offs); + buf_buddy_add_to_free(zip_buf, j); + } + + buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i); + return(buf); +} + +/** Allocate a ROW_FORMAT=COMPRESSED block. +@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES +@param lru assigned to true if buf_pool.mutex was temporarily released +@return allocated block, never NULL */ +byte *buf_buddy_alloc_low(ulint i, bool *lru) +{ + buf_block_t* block; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. */ + block = (buf_block_t*) buf_buddy_alloc_zip(i); + + if (block) { + goto func_exit; + } + } + + /* Try allocating from the buf_pool.free list. */ + block = buf_LRU_get_free_only(); + + if (block) { + goto alloc_big; + } + + /* Try replacing an uncompressed page in the buffer pool. */ + block = buf_LRU_get_free_block(true); + if (lru) { + *lru = true; + } + +alloc_big: + buf_buddy_block_register(block); + + block = reinterpret_cast<buf_block_t*>( + buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES)); + +func_exit: + buf_pool.buddy_stat[i].used++; + return reinterpret_cast<byte*>(block); +} + +/** Try to relocate a block. The caller must hold zip_free_mutex, and this +function will release and lock it again. 
+@param[in] src block to relocate +@param[in] dst free block to relocated to +@param[in] i index of buf_pool.zip_free[] +@param[in] force true if we must relocated always +@return true if relocated */ +static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) +{ + buf_page_t* bpage; + const ulint size = BUF_BUDDY_LOW << i; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!ut_align_offset(src, size)); + ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + MEM_CHECK_ADDRESSABLE(dst, size); + + uint32_t space = mach_read_from_4(static_cast<const byte*>(src) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + uint32_t offset = mach_read_from_4(static_cast<const byte*>(src) + + FIL_PAGE_OFFSET); + + /* Suppress Valgrind or MSAN warnings. */ + MEM_MAKE_DEFINED(&space, sizeof space); + MEM_MAKE_DEFINED(&offset, sizeof offset); + + ut_ad(space != BUF_BUDDY_STAMP_FREE); + + const page_id_t page_id(space, offset); + /* FIXME: we are computing this while holding buf_pool.mutex */ + auto &cell= buf_pool.page_hash.cell_get(page_id.fold()); + + bpage = buf_pool.page_hash.get(page_id, cell); + + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool.page_hash yet. Obviously, + it cannot be relocated. */ + + if (!force || space != 0 || offset != 0) { + return(false); + } + + /* It might be just uninitialized page. + We should search from LRU list also. */ + + bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + while (bpage != NULL) { + if (bpage->zip.data == src) { + ut_ad(bpage->id() == page_id); + break; + } + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + + if (bpage == NULL) { + return(false); + } + } + + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. 
*/ + ut_ad(page_zip_get_size(&bpage->zip) < size); + return(false); + } + + /* The block must have been allocated, but it may + contain uninitialized data. */ + MEM_CHECK_ADDRESSABLE(src, size); + + if (!bpage->can_relocate()) { + return false; + } + + page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell); + /* It does not make sense to use transactional_lock_guard here, + because the memcpy() of 1024 to 16384 bytes would likely make the + memory transaction too large. */ + hash_lock.lock(); + + if (bpage->can_relocate()) { + /* Relocate the compressed page. */ + const ulonglong ns = my_interval_timer(); + + ut_a(bpage->zip.data == src); + + memcpy(dst, src, size); + bpage->zip.data = reinterpret_cast<page_zip_t*>(dst); + + hash_lock.unlock(); + + buf_buddy_mem_invalid( + reinterpret_cast<buf_buddy_free_t*>(src), i); + + buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000; + return(true); + } + + hash_lock.unlock(); + + return(false); +} + +/** Deallocate a block. +@param[in] buf block to be freed, must not be pointed to + by the buffer pool +@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ +void buf_buddy_free_low(void* buf, ulint i) +{ + buf_buddy_free_t* buddy; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + ut_ad(buf_pool.buddy_stat[i].used > 0); + + buf_pool.buddy_stat[i].used--; +recombine: + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i); + + if (i == BUF_BUDDY_SIZES) { + buf_buddy_block_free(buf); + return; + } + + ut_ad(i < BUF_BUDDY_SIZES); + ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool.contains_zip(buf)); + + /* Do not recombine blocks if there are few free blocks. 
+ We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16 + && !buf_pool.is_shrinking()) { + goto func_exit; + } + + /* Try to combine adjacent blocks. */ + buddy = reinterpret_cast<buf_buddy_free_t*>( + buf_buddy_get(reinterpret_cast<byte*>(buf), + BUF_BUDDY_LOW << i)); + + switch (buf_buddy_is_free(buddy, i)) { + case BUF_BUDDY_STATE_FREE: + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(buddy, i); +buddy_is_free: + ut_ad(!buf_pool.contains_zip(buddy)); + i++; + buf = ut_align_down(buf, BUF_BUDDY_LOW << i); + + goto recombine; + + case BUF_BUDDY_STATE_USED: + ut_d(buf_buddy_list_validate(i)); + + /* The buddy is not free. Is there a free block of + this size? */ + if (buf_buddy_free_t* zip_buf = + UT_LIST_GET_FIRST(buf_pool.zip_free[i])) { + + /* Remove the block from the free list, because + a successful buf_buddy_relocate() will overwrite + zip_free->list. */ + buf_buddy_remove_from_free(zip_buf, i); + + /* Try to relocate the buddy of buf to the free + block. */ + if (buf_buddy_relocate(buddy, zip_buf, i, false)) { + goto buddy_is_free; + } + + buf_buddy_add_to_free(zip_buf, i); + } + + break; + case BUF_BUDDY_STATE_PARTIALLY_USED: + /* Some sub-blocks in the buddy are still in use. + Relocation will fail. No need to try. */ + break; + } + +func_exit: + /* Free the block to the buddy list. */ + buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i); +} + +/** Try to reallocate a block. +@param[in] buf buf_pool block to be reallocated +@param[in] size block size, up to srv_page_size +@return whether the reallocation succeeded */ +bool +buf_buddy_realloc(void* buf, ulint size) +{ + buf_block_t* block = NULL; + ulint i = buf_buddy_get_slot(size); + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. 
*/ + block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i)); + } + + if (block == NULL) { + /* Try allocating from the buf_pool.free list. */ + block = buf_LRU_get_free_only(); + + if (block == NULL) { + return(false); /* free_list was not enough */ + } + + buf_buddy_block_register(block); + + block = reinterpret_cast<buf_block_t*>( + buf_buddy_alloc_from( + block->page.frame, i, BUF_BUDDY_SIZES)); + } + + buf_pool.buddy_stat[i].used++; + + /* Try to relocate the buddy of buf to the free block. */ + if (buf_buddy_relocate(buf, block, i, true)) { + /* succeeded */ + buf_buddy_free_low(buf, i); + } else { + /* failed */ + buf_buddy_free_low(block, i); + } + + return(true); /* free_list was enough */ +} + +/** Combine all pairs of free buddies. */ +void buf_buddy_condense_free() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.is_shrinking()); + + for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) { + buf_buddy_free_t* buf = + UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + /* seek to withdraw target */ + while (buf != NULL + && !buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(buf))) { + buf = UT_LIST_GET_NEXT(list, buf); + } + + while (buf != NULL) { + buf_buddy_free_t* next = + UT_LIST_GET_NEXT(list, buf); + + buf_buddy_free_t* buddy = + reinterpret_cast<buf_buddy_free_t*>( + buf_buddy_get( + reinterpret_cast<byte*>(buf), + BUF_BUDDY_LOW << i)); + + /* seek to the next withdraw target */ + while (true) { + while (next != NULL + && !buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(next))) { + next = UT_LIST_GET_NEXT(list, next); + } + + if (buddy != next) { + break; + } + + next = UT_LIST_GET_NEXT(list, next); + } + + if (buf_buddy_is_free(buddy, i) + == BUF_BUDDY_STATE_FREE) { + /* Both buf and buddy are free. + Try to combine them. 
*/ + buf_buddy_remove_from_free(buf, i); + buf_pool.buddy_stat[i].used++; + + buf_buddy_free_low(buf, i); + } + + buf = next; + } + } +} diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc new file mode 100644 index 00000000..8ef18ee0 --- /dev/null +++ b/storage/innobase/buf/buf0buf.cc @@ -0,0 +1,4180 @@ +/***************************************************************************** + +Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "mtr0types.h" +#include "mach0data.h" +#include "buf0checksum.h" +#include "mariadb_stats.h" +#include <string.h> + +#ifdef UNIV_INNOCHECKSUM +# include "my_sys.h" +# include "buf0buf.h" +#else +#include "my_cpu.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "buf0rea.h" +#include "buf0flu.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include 
"trx0undo.h" +#include "trx0purge.h" +#include "log0log.h" +#include "dict0stats_bg.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "srv0mon.h" +#include "log0crypt.h" +#include "fil0pagecompress.h" +#endif /* !UNIV_INNOCHECKSUM */ +#include "page0zip.h" +#include "buf0dump.h" +#include <map> +#include <sstream> +#include "log.h" + +using st_::span; + +#ifdef HAVE_LIBNUMA +#include <numa.h> +#include <numaif.h> +struct set_numa_interleave_t +{ + set_numa_interleave_t() + { + if (srv_numa_interleave) { + + struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + ib::info() << "Setting NUMA memory policy to" + " MPOL_INTERLEAVE"; + if (set_mempolicy(MPOL_INTERLEAVE, + numa_mems_allowed->maskp, + numa_mems_allowed->size) != 0) { + + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_INTERLEAVE: " + << strerror(errno); + } + numa_bitmask_free(numa_mems_allowed); + } + } + + ~set_numa_interleave_t() + { + if (srv_numa_interleave) { + + ib::info() << "Setting NUMA memory policy to" + " MPOL_DEFAULT"; + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_DEFAULT: " + << strerror(errno); + } + } + } +}; + +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa +#else +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE +#endif /* HAVE_LIBNUMA */ + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. 
+ + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool.mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool.mutex. + +The buf_pool.mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool.mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool.mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. 
We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool.free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate block->page.belongs_to_unzip_LRU() +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. 
+ +The chain of modified blocks (buf_pool.flush_list) contains the blocks +holding persistent file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool.flush_list_mutex. + +The control blocks for uncompressed pages are accessible via +buf_block_t objects that are reachable via buf_pool.chunks[]. +The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages +that are not in buf_pool.flush_list and for which no uncompressed +page has been allocated in buf_pool are only accessible via +buf_pool.LRU. + +The chains of free memory blocks (buf_pool.zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size 1024..innodb_page_size / 2. These +blocks are inside the memory blocks of size innodb_page_size and type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating +ROW_FORMAT=COMPRESSED page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and releases the io_fix +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. 
From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_INNOCHECKSUM +# ifdef SUX_LOCK_GENERIC +void page_hash_latch::read_lock_wait() +{ + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + LF_BACKOFF(); + if (read_trylock()) + return; + } + /* Fall back to yielding to other threads. 
*/ + do + std::this_thread::yield(); + while (!read_trylock()); +} + +void page_hash_latch::write_lock_wait() +{ + write_lock_wait_start(); + + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + if (write_lock_poll()) + return; + LF_BACKOFF(); + } + + /* Fall back to yielding to other threads. */ + do + std::this_thread::yield(); + while (!write_lock_poll()); +} +# endif + +/** Number of attempts made to read in a page in the buffer pool */ +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; +/** The maximum portion of the buffer pool that can be used for the +read-ahead buffer. (Divide buf_pool size by this amount) */ +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). */ +const byte *field_ref_zero; + +/** The InnoDB buffer pool */ +buf_pool_t buf_pool; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; + +#ifdef UNIV_DEBUG +/** This is used to insert validation operations in execution +in the debug version */ +static Atomic_counter<size_t> buf_dbg_counter; +#endif /* UNIV_DEBUG */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(read, counter) \ + (read ? (counter##_READ) : (counter##_WRITTEN)) + +/** Decrypt a page for temporary tablespace. 
+@param[in,out] tmp_frame Temporary buffer +@param[in] src_frame Page to decrypt +@return true if temporary tablespace decrypted, false if not */ +static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) +{ + if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) { + return true; + } + + /* read space & lsn */ + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + /* Copy FIL page header, it is not encrypted */ + memcpy(tmp_frame, src_frame, header_len); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + header_len; + byte* dst = tmp_frame + header_len; + uint srclen = uint(srv_page_size) + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); + ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + if (!log_tmp_block_decrypt(src, srclen, dst, + (offset * srv_page_size))) { + return false; + } + + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(src_frame, tmp_frame, + srv_page_size); + srv_stats.pages_decrypted.inc(); + srv_stats.n_temp_blocks_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page. +@param[in,out] bpage Page control block +@param[in] node data file +@return whether the operation was successful */ +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; + + byte* dst_frame = bpage->zip.data ? 
/** Decrypt a page after it has been read from a data file.
@param[in,out] bpage Page control block
@param[in] node data file
@return whether the operation was successful */
static bool buf_page_decrypt_after_read(buf_page_t *bpage,
                                        const fil_node_t &node)
{
	ut_ad(node.space->referenced());
	ut_ad(node.space->id == bpage->id().space());
	const auto flags = node.space->flags;

	/* For ROW_FORMAT=COMPRESSED pages the raw file contents were read
	into bpage->zip.data; otherwise into the uncompressed frame. */
	byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
	bool page_compressed = node.space->is_compressed()
		&& buf_page_is_compressed(dst_frame, flags);
	const page_id_t id(bpage->id());

	if (id.page_no() == 0) {
		/* File header pages are not encrypted/compressed */
		return (true);
	}

	buf_tmp_buffer_t* slot;

	if (id.space() == SRV_TMP_SPACE_ID
	    && innodb_encrypt_temporary_tables) {
		/* Temporary tablespace pages use their own cipher;
		decrypt them in place via a reserved scratch slot. */
		slot = buf_pool.io_buf_reserve();
		slot->allocate();
		bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame);
		slot->release();
		return ok;
	}

	/* Page is encrypted if encryption information is found from
	tablespace and page contains used key_version. This is true
	also for pages first compressed and then encrypted. */

	uint key_version = buf_page_get_key_version(dst_frame, flags);

	if (page_compressed && !key_version) {
		/* the page we read is unencrypted */
		/* Find free slot from temporary memory array */
decompress:
		if (fil_space_t::full_crc32(flags)
		    && buf_page_is_corrupted(true, dst_frame, flags)) {
			return false;
		}

		slot = buf_pool.io_buf_reserve();
		slot->allocate();

decompress_with_slot:
		/* fil_page_decompress() returns 0 on failure. */
		ulint write_size = fil_page_decompress(
			slot->crypt_buf, dst_frame, flags);
		slot->release();
		ut_ad(node.space->referenced());
		return write_size != 0;
	}

	if (key_version && node.space->crypt_data) {
		/* Verify encryption checksum before we even try to
		decrypt. */
		if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
decrypt_failed:
			ib::error() << "Encrypted page " << id
				    << " in file " << node.name
				    << " looks corrupted; key_version="
				    << key_version;
			return false;
		}

		slot = buf_pool.io_buf_reserve();
		slot->allocate();

		/* decrypt using crypt_buf to dst_frame */
		if (!fil_space_decrypt(node.space, slot->crypt_buf,
				       dst_frame)) {
			slot->release();
			goto decrypt_failed;
		}

		if ((fil_space_t::full_crc32(flags) && page_compressed)
		    || fil_page_get_type(dst_frame)
		    == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
			/* The page was compressed before encryption:
			decompress it now, reusing the reserved slot. */
			goto decompress_with_slot;
		}

		slot->release();
	} else if (fil_page_get_type(dst_frame)
		   == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
		goto decompress;
	}

	ut_ad(node.space->referenced());
	return true;
}
*/ + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { +decrypt_failed: + ib::error() << "Encrypted page " << id + << " in file " << node.name + << " looks corrupted; key_version=" + << key_version; + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + + /* decrypt using crypt_buf to dst_frame */ + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { + slot->release(); + goto decrypt_failed; + } + + if ((fil_space_t::full_crc32(flags) && page_compressed) + || fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress_with_slot; + } + + slot->release(); + } else if (fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress; + } + + ut_ad(node.space->referenced()); + return true; +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. */ +static +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + +#ifdef UNIV_INNOCHECKSUM + extern FILE* log_file; + extern uint32_t cur_page_num; + if (log_file) { + fprintf(log_file, "page::" UINT32PF ";" + " crc32 calculated = " UINT32PF ";" + " recorded checksum field1 = " ULINTPF " recorded" + " checksum field2 =" ULINTPF "\n", cur_page_num, + crc32, checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + if (checksum_field1 != checksum_field2) { + return false; + } + + return checksum_field1 == crc32; +} + +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. 
*/ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) +{ +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + const lsn_t current_lsn = log_sys.get_lsn(); + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + if (current_lsn < page_lsn) { + + const uint32_t space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const uint32_t page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ +} + + +/** Check if a buffer is all zeroes. +@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(span<const byte> buf) +{ + ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX); + return memcmp(buf.data(), field_ref_zero, buf.size()) == 0; +} + +/** Check if a page is corrupt. 
/** Check if a page is corrupt.
@param check_lsn whether FIL_PAGE_LSN should be checked
@param read_buf database page
@param fsp_flags contents of FIL_SPACE_FLAGS
@return whether the page is corrupted */
bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
                           uint32_t fsp_flags)
{
	if (fil_space_t::full_crc32(fsp_flags)) {
		/* innodb_checksum_algorithm=full_crc32: one CRC-32C
		covers the whole page except the trailing checksum. */
		bool compressed = false, corrupted = false;
		const uint size = buf_page_full_crc32_size(
			read_buf, &compressed, &corrupted);
		if (corrupted) {
			return true;
		}
		const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM);
		uint crc32 = mach_read_from_4(end);

		if (!crc32 && size == srv_page_size
		    && buf_is_zeroes(span<const byte>(read_buf, size))) {
			/* An all-zero page is not corrupted. */
			return false;
		}

		DBUG_EXECUTE_IF(
			"page_intermittent_checksum_mismatch", {
				static int page_counter;
				if (page_counter++ == 3) {
					crc32++;
				}
			});

		if (crc32 != my_crc32c(0, read_buf,
				       size - FIL_PAGE_FCRC32_CHECKSUM)) {
			return true;
		}
		static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
		static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
		/* On unencrypted, uncompressed pages the low 32 bits of
		FIL_PAGE_LSN must be repeated near the end of the page. */
		if (!compressed
		    && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
					 + read_buf)
		    && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
					 end - (FIL_PAGE_FCRC32_END_LSN
						- FIL_PAGE_FCRC32_CHECKSUM),
					 4)) {
			return true;
		}

		buf_page_check_lsn(check_lsn, read_buf);
		return false;
	}

	const ulint zip_size = fil_space_t::zip_size(fsp_flags);
	const uint16_t page_type = fil_page_get_type(read_buf);

	/* We can trust page type if page compression is set on tablespace
	flags because page compression flag means file must have been
	created with 10.1 (later than 5.5 code base). In 10.1 page
	compressed tables do not contain post compression checksum and
	FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
	be null if we are in fil_check_first_page() and first page
	is not compressed or encrypted. Page checksum is verified
	after decompression (i.e. normally pages are already
	decompressed at this stage). */
	if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
	     page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
#ifndef UNIV_INNOCHECKSUM
	    && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags)
#endif
	    ) {
		return(false);
	}

	static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");

	if (!zip_size
	    && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
				 read_buf + srv_page_size
				 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
		/* Stored log sequence numbers at the start and the end
		of page do not match */

		return(true);
	}

	buf_page_check_lsn(check_lsn, read_buf);

	/* Check whether the checksum fields have correct values */

	if (zip_size) {
		return !page_zip_verify_checksum(read_buf, zip_size);
	}

	const uint32_t checksum_field1 = mach_read_from_4(
		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);

	const uint32_t checksum_field2 = mach_read_from_4(
		read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);

	static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");

	/* A page filled with NUL bytes is considered not corrupted.
	Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
	the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero
	for the first page of each file of the system tablespace.
	We want to ignore it for the system tablespace, but because
	we do not know the expected tablespace here, we ignore the
	field for all data files, except for
	innodb_checksum_algorithm=full_crc32 which we handled above. */
	if (!checksum_field1 && !checksum_field2) {
		/* Checksum fields can have valid value as zero.
		If the page is not empty then do the checksum
		calculation for the page. */
		bool all_zeroes = true;
		for (size_t i = 0; i < srv_page_size; i++) {
#ifndef UNIV_INNOCHECKSUM
			if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) {
				/* Skip the 8 bytes that may carry a
				legacy nonzero flush LSN (see above). */
				i += 8;
			}
#endif
			if (read_buf[i]) {
				all_zeroes = false;
				break;
			}
		}

		if (all_zeroes) {
			return false;
		}
	}

#ifndef UNIV_INNOCHECKSUM
	switch (srv_checksum_algorithm) {
	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
#endif /* !UNIV_INNOCHECKSUM */
		return !buf_page_is_checksum_valid_crc32(
			read_buf, checksum_field1, checksum_field2);
#ifndef UNIV_INNOCHECKSUM
	default:
		if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC
		    && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) {
			return false;
		}

		const uint32_t crc32 = buf_calc_page_crc32(read_buf);

		/* Very old versions of InnoDB only stored 8 byte lsn to the
		start and the end of the page. */

		/* Since innodb_checksum_algorithm is not strict_* allow
		any of the algos to match for the old field */

		if (checksum_field2
		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {

			DBUG_EXECUTE_IF(
				"page_intermittent_checksum_mismatch", {
					static int page_counter;
					if (page_counter++ == 3) return true;
				});

			if ((checksum_field1 != crc32
			     || checksum_field2 != crc32)
			    && checksum_field2
			    != buf_calc_page_old_checksum(read_buf)) {
				return true;
			}
		}

		switch (checksum_field1) {
		case 0:
		case BUF_NO_CHECKSUM_MAGIC:
			return false;
		}
		/* Accept either the new-style CRC-32C pair or the legacy
		innodb algorithm for the header field. */
		return (checksum_field1 != crc32 || checksum_field2 != crc32)
			&& checksum_field1
			!= buf_calc_page_new_checksum(read_buf);
	}
#endif /* !UNIV_INNOCHECKSUM */
}
file. + +Returns number of errors found in madvise calls. */ +MY_ATTRIBUTE((used)) +int +buf_madvise_do_dump() +{ + int ret= 0; + + /* mirrors allocation in log_t::create() */ + if (log_sys.buf) { + ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP); + ret += madvise(log_sys.flush_buf, log_sys.buf_size, + MADV_DODUMP); + } + + mysql_mutex_lock(&buf_pool.mutex); + auto chunk = buf_pool.chunks; + + for (ulint n = buf_pool.n_chunks; n--; chunk++) { + ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return ret; +} +#endif + +#ifndef UNIV_DEBUG +static inline byte hex_to_ascii(byte hex_digit) +{ + const int offset= hex_digit <= 9 ? '0' : 'a' - 10; + return byte(hex_digit + offset); +} +#endif + +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +ATTRIBUTE_COLD +void buf_page_print(const byte *read_buf, ulint zip_size) +{ +#ifndef UNIV_DEBUG + const size_t size = zip_size ? zip_size : srv_page_size; + const byte * const end= read_buf + size; + sql_print_information("InnoDB: Page dump (%zu bytes):", size); + + do + { + byte row[64]; + + for (byte *r= row; r != &row[64]; r+= 2, read_buf++) + { + r[0]= hex_to_ascii(byte(*read_buf >> 4)); + r[1]= hex_to_ascii(*read_buf & 15); + } + + sql_print_information("InnoDB: %.*s", 64, row); + } + while (read_buf != end); + + sql_print_information("InnoDB: End of page dump"); +#endif +} + +/** Initialize a buffer page descriptor. +@param[in,out] block buffer page descriptor +@param[in] frame buffer page frame */ +static +void +buf_block_init(buf_block_t* block, byte* frame) +{ + /* This function should only be executed at database startup or by + buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ + assert_block_ahi_empty_on_init(block); + + block->page.frame = frame; + + MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); + ut_ad(!block->modify_clock); + MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); + block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); +#ifdef BTR_CUR_HASH_ADAPT + MEM_MAKE_DEFINED(&block->index, sizeof block->index); + ut_ad(!block->index); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); + + page_zip_des_init(&block->page.zip); + + MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); + ut_ad(!block->page.hash); +} + +/** Allocate a chunk of buffer frames. +@param bytes requested size +@return whether the allocation succeeded */ +inline bool buf_pool_t::chunk_t::create(size_t bytes) +{ + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); + /* Round down to a multiple of page size, although it already should be. */ + bytes= ut_2pow_round<size_t>(bytes, srv_page_size); + + mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); + + if (UNIV_UNLIKELY(!mem)) + return false; + + MEM_UNDEFINED(mem, mem_size()); + +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + if (mbind(mem, mem_size(), MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + { + ib::warn() << "Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: " << strerror(errno) << ")."; + } + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ + + + /* Allocate the block descriptors from + the start of the memory block. */ + blocks= reinterpret_cast<buf_block_t*>(mem); + + /* Align a pointer to the first frame. 
Note that when + opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); + + byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) + + srv_page_size - 1) & + ~ulint{srv_page_size - 1}); + size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint s= size; + + while (frame < reinterpret_cast<const byte*>(blocks + s)) + { + frame+= srv_page_size; + s--; + } + + size= s; + } + + /* Init block structs and assign frames for them. Then we assign the + frames to the first blocks (we already mapped the memory above). */ + + buf_block_t *block= blocks; + + for (auto i= size; i--; ) { + buf_block_init(block, frame); + MEM_UNDEFINED(block->page.frame, srv_page_size); + /* Add the block to the free list */ + UT_LIST_ADD_LAST(buf_pool.free, &block->page); + + ut_d(block->page.in_free_list = TRUE); + block++; + frame+= srv_page_size; + } + + reg(); + + return true; +} + +#ifdef UNIV_DEBUG +/** Check that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const +{ + buf_block_t *block= blocks; + for (auto i= size; i--; block++) + { + if (block->page.in_file()) + { + /* The uncompressed buffer pool should never + contain ROW_FORMAT=COMPRESSED block descriptors. */ + ut_ad(block->page.frame); + const lsn_t lsn= block->page.oldest_modification(); + + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. 
*/ + ut_ad(lsn == 0 || lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + if (fsp_is_system_temporary(block->page.id().space())) + { + ut_ad(lsn == 0 || lsn == 2); + break; + } + + if (lsn > 1 || !block->page.can_relocate()) + return block; + + break; + } + } + + return nullptr; +} +#endif /* UNIV_DEBUG */ + +/** Create the hash table. +@param n the lower bound of n_cells */ +void buf_pool_t::page_hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + array= static_cast<hash_chain*>(v); +} + +/** Create the buffer pool. +@return whether the creation failed */ +bool buf_pool_t::create() +{ + ut_ad(this == &buf_pool); + ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); + ut_ad(!is_initialised()); + ut_ad(srv_buf_pool_size > 0); + ut_ad(!resizing); + ut_ad(!chunks_old); + /* mariabackup loads tablespaces, and it requires field_ref_zero to be + allocated before innodb initialization */ + ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + if (!field_ref_zero) { + if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + field_ref_zero= static_cast<const byte*> + (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); + else + return true; + } + + chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + + new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool); + + n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; + const size_t chunk_size= srv_buf_pool_chunk_unit; + + chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks)); + UT_LIST_INIT(free, &buf_page_t::list); + curr_size= 0; + auto chunk= chunks; + + do + { + if (!chunk->create(chunk_size)) + { + while (--chunk >= chunks) + { + buf_block_t* block= chunk->blocks; + + for (auto i= 
chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + aligned_free(const_cast<byte*>(field_ref_zero)); + field_ref_zero= nullptr; + ut_ad(!is_initialised()); + return true; + } + + curr_size+= chunk->size; + } + while (++chunk < chunks + n_chunks); + + ut_ad(is_initialised()); +#if defined(__aarch64__) + mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); +#endif + + UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target= 0; + UT_LIST_INIT(flush_list, &buf_page_t::list); + UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); + + for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) + UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); + ulint s= curr_size; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= srv_buf_pool_size; + + n_chunks_new= n_chunks; + + page_hash.create(2 * curr_size); + zip_hash.create(2 * curr_size); + last_printout_time= time(NULL); + + mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, + MY_MUTEX_INIT_FAST); + + pthread_cond_init(&done_flush_LRU, nullptr); + pthread_cond_init(&done_flush_list, nullptr); + pthread_cond_init(&do_flush_list, nullptr); + pthread_cond_init(&done_free, nullptr); + + try_LRU_scan= true; + + ut_d(flush_hp.m_mutex= &flush_list_mutex;); + ut_d(lru_hp.m_mutex= &mutex); + ut_d(lru_scan_itr.m_mutex= &mutex); + + io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * + OS_AIO_N_PENDING_IOS_PER_THREAD); + + /* FIXME: remove some of these variables */ + srv_buf_pool_curr_size= curr_pool_size; + srv_buf_pool_old_size= srv_buf_pool_size; + srv_buf_pool_base_size= srv_buf_pool_size; + + last_activity_count= srv_get_activity_count(); + + chunk_t::map_ref= chunk_t::map_reg; + buf_LRU_old_ratio_update(100 * 3 / 8, false); + btr_search_sys_create(); + ut_ad(is_initialised()); + return false; +} + +/** Clean up after successful create() */ +void buf_pool_t::close() +{ + ut_ad(this == &buf_pool); + if (!is_initialised()) + return; + + mysql_mutex_destroy(&mutex); + mysql_mutex_destroy(&flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; + bpage= prev_bpage) + { + prev_bpage= UT_LIST_GET_PREV(LRU, bpage); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_d(const lsn_t oldest= bpage->oldest_modification();) + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? 
	  (oldest == 0 || oldest == 2)
	  : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);

    if (UNIV_UNLIKELY(!bpage->frame))
    {
      /* A ROW_FORMAT=COMPRESSED-only descriptor was allocated
      separately from the chunks; free its latch and the descriptor */
      bpage->lock.free();
      ut_free(bpage);
    }
  }

  /* Free the page latches of every block in every chunk, then release
  the chunk memory itself */
  for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
  {
    buf_block_t *block= chunk->blocks;

    for (auto i= chunk->size; i--; block++)
      block->page.lock.free();

    allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
  }

  pthread_cond_destroy(&done_flush_LRU);
  pthread_cond_destroy(&done_flush_list);
  pthread_cond_destroy(&do_flush_list);
  pthread_cond_destroy(&done_free);

  ut_free(chunks);
  chunks= nullptr;
  page_hash.free();
  zip_hash.free();

  io_buf.close();
  UT_DELETE(chunk_t::map_reg);
  chunk_t::map_reg= chunk_t::map_ref= nullptr;
  aligned_free(const_cast<byte*>(field_ref_zero));
  field_ref_zero= nullptr;
}

/** Try to reallocate a control block.
Copies the frame contents and relocates the descriptor to a block taken
from buf_pool.free, so that the old block (which presumably resides in
the area being withdrawn by resize()) can be released.
@param block control block to reallocate
@return whether the reallocation succeeded */
inline bool buf_pool_t::realloc(buf_block_t *block)
{
	buf_block_t*	new_block;

	mysql_mutex_assert_owner(&mutex);
	ut_ad(block->page.in_file());
	ut_ad(block->page.frame);

	new_block = buf_LRU_get_free_only();

	if (new_block == NULL) {
		/* Ask the page cleaner to produce free blocks;
		the caller will retry later. */
		mysql_mutex_lock(&buf_pool.flush_list_mutex);
		page_cleaner_wakeup();
		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
		return(false); /* free list was not enough */
	}

	const page_id_t id{block->page.id()};
	hash_chain& chain = page_hash.cell_get(id.fold());
	page_hash_latch& hash_lock = page_hash.lock_get(chain);
	/* It does not make sense to use transactional_lock_guard
	here, because copying innodb_page_size (4096 to 65536) bytes
	as well as other changes would likely make the memory
	transaction too large. */
	hash_lock.lock();

	if (block->page.can_relocate()) {
		/* Copy the frame payload, then transplant the descriptor
		into new_block via placement new, preserving new_block's
		own frame pointer (and its freshly freed latch). */
		memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
			new_block->page.frame, block->page.frame,
			srv_page_size);
		mysql_mutex_lock(&buf_pool.flush_list_mutex);
		const auto frame = new_block->page.frame;
		new_block->page.lock.free();
		new (&new_block->page) buf_page_t(block->page);
		new_block->page.frame = frame;

		/* relocate LRU list */
		if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) {
			UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
		} else {
			UT_LIST_ADD_FIRST(LRU, &new_block->page);
		}

		if (LRU_old == &block->page) {
			LRU_old = &new_block->page;
		}

		ut_ad(new_block->page.in_LRU_list);

		/* relocate unzip_LRU list */
		if (block->page.zip.data != NULL) {
			ut_ad(block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = true);

			buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
			UT_LIST_REMOVE(unzip_LRU, block);

			ut_d(block->in_unzip_LRU_list = false);
			/* The compressed page was moved along with the
			descriptor copy; detach it from the old block */
			block->page.zip.data = NULL;
			page_zip_set_size(&block->page.zip, 0);

			if (prev_block != NULL) {
				UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
			} else {
				UT_LIST_ADD_FIRST(unzip_LRU, new_block);
			}
		} else {
			ut_ad(!block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = false);
		}

		/* relocate page_hash */
		hash_chain& chain = page_hash.cell_get(id.fold());
		ut_ad(&block->page == page_hash.get(id, chain));
		buf_pool.page_hash.replace(chain, &block->page,
					   &new_block->page);
		buf_block_modify_clock_inc(block);
		/* Invalidate the page identifier in the old frame so that
		stale lookups cannot mistake it for a valid page */
		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
		memset_aligned<4>(block->page.frame
				  + FIL_PAGE_OFFSET, 0xff, 4);
		static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
			      "not perfect alignment");
		memset_aligned<2>(block->page.frame
				  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
		MEM_UNDEFINED(block->page.frame, srv_page_size);
		block->page.set_state(buf_page_t::REMOVE_HASH);
		if (!fsp_is_system_temporary(id.space())) {
			buf_flush_relocate_on_flush_list(&block->page,
							 &new_block->page);
		}
		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
		block->page.set_corrupt_id();

		/* set other flags of buf_block_t */

#ifdef BTR_CUR_HASH_ADAPT
		/* This code should only be executed by resize(),
		while the adaptive hash index is disabled. */
		assert_block_ahi_empty(block);
		assert_block_ahi_empty_on_init(new_block);
		ut_ad(!block->index);
		new_block->index	= NULL;
		new_block->n_hash_helps	= 0;
		new_block->n_fields	= 1;
		new_block->left_side	= TRUE;
#endif /* BTR_CUR_HASH_ADAPT */
		ut_d(block->page.set_state(buf_page_t::MEMORY));
		/* free block */
		new_block = block;
	}

	hash_lock.unlock();
	/* If can_relocate() failed, this releases the unused new_block;
	otherwise it releases the old block (aliased as new_block above) */
	buf_LRU_block_free_non_file_page(new_block);
	return(true); /* free_list was enough */
}

/** Allocate and zero-initialize the array of temporary I/O buffer slots.
@param n_slots number of slots to create */
void buf_pool_t::io_buf_t::create(ulint n_slots)
{
  this->n_slots= n_slots;
  slots= static_cast<buf_tmp_buffer_t*>
    (ut_malloc_nokey(n_slots * sizeof *slots));
  memset((void*) slots, 0, n_slots * sizeof *slots);
}

/** Release the temporary I/O buffer slots and their encryption and
compression scratch buffers. */
void buf_pool_t::io_buf_t::close()
{
  for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
  {
    aligned_free(s->crypt_buf);
    aligned_free(s->comp_buf);
  }
  ut_free(slots);
  slots= nullptr;
  n_slots= 0;
}

/** Acquire a free temporary I/O buffer slot, waiting for pending
writes (and then reads) to complete if all slots are busy.
@return an acquired slot (never nullptr) */
buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
{
  for (;;)
  {
    for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
      if (s->acquire())
        return s;
    os_aio_wait_until_no_pending_writes(true);
    for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
      if (s->acquire())
        return s;
    os_aio_wait_until_no_pending_reads(true);
  }
}

/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3).
@param[in]	fmt	format
@param[in]	...	extra parameters according to fmt */
static
void
buf_resize_status(
	const char*	fmt,
	...)
{
	va_list	ap;

	va_start(ap, fmt);

	/* vsnprintf() truncates safely to the size of the status buffer */
	vsnprintf(
		export_vars.innodb_buffer_pool_resize_status,
		sizeof(export_vars.innodb_buffer_pool_resize_status),
		fmt, ap);

	va_end(ap);

	/* Also log the status so progress is visible in the error log */
	ib::info() << export_vars.innodb_buffer_pool_resize_status;
}

/** Withdraw blocks from the buffer pool until meeting withdraw_target.
Moves blocks that lie in the to-be-freed chunks from buf_pool.free to
buf_pool.withdraw, flushing and relocating in-use blocks as needed.
@return whether retry is needed */
inline bool buf_pool_t::withdraw_blocks()
{
	buf_block_t*	block;
	ulint		loop_count = 0;

	ib::info() << "Start to withdraw the last "
		<< withdraw_target << " blocks.";

	while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {

		/* try to withdraw from free_list */
		ulint	count1 = 0;

		mysql_mutex_lock(&mutex);
		buf_buddy_condense_free();
		block = reinterpret_cast<buf_block_t*>(
			UT_LIST_GET_FIRST(free));
		while (block != NULL
		       && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
			ut_ad(block->page.in_free_list);
			ut_ad(!block->page.oldest_modification());
			ut_ad(!block->page.in_LRU_list);
			ut_a(!block->page.in_file());

			/* Save the successor before the list is modified */
			buf_block_t*	next_block;
			next_block = reinterpret_cast<buf_block_t*>(
				UT_LIST_GET_NEXT(
					list, &block->page));

			if (will_be_withdrawn(block->page)) {
				/* This should be withdrawn */
				UT_LIST_REMOVE(free, &block->page);
				UT_LIST_ADD_LAST(withdraw, &block->page);
				ut_d(block->in_withdraw_list = true);
				count1++;
			}

			block = next_block;
		}

		/* reserve free_list length */
		if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
			/* Flush dirty LRU pages to produce more free
			blocks, then wait for the batch to finish */
			buf_flush_LRU(
				std::max<ulint>(withdraw_target
						- UT_LIST_GET_LEN(withdraw),
						srv_LRU_scan_depth),
				true);
			mysql_mutex_unlock(&buf_pool.mutex);
			buf_dblwr.flush_buffered_writes();
			mysql_mutex_lock(&buf_pool.flush_list_mutex);
			buf_flush_wait_LRU_batch_end();
			mysql_mutex_unlock(&buf_pool.flush_list_mutex);
			mysql_mutex_lock(&buf_pool.mutex);
		}

		/* relocate blocks/buddies in withdrawn area */
		ulint	count2 = 0;

		buf_pool_mutex_exit_forbid();
		for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage;
		     bpage; bpage = next_bpage) {
			ut_ad(bpage->in_file());
			next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
			/* First move any ROW_FORMAT=COMPRESSED page frame
			that resides in the withdrawn area */
			if (UNIV_LIKELY_NULL(bpage->zip.data)
			    && will_be_withdrawn(bpage->zip.data)
			    && bpage->can_relocate()) {
				if (!buf_buddy_realloc(
					    bpage->zip.data,
					    page_zip_get_size(&bpage->zip))) {
					/* failed to allocate block */
					break;
				}
				count2++;
				if (bpage->frame) {
					goto realloc_frame;
				}
			}

			/* Then move the uncompressed frame, if any */
			if (bpage->frame && will_be_withdrawn(*bpage)
			    && bpage->can_relocate()) {
realloc_frame:
				if (!realloc(reinterpret_cast<buf_block_t*>(
						     bpage))) {
					/* failed to allocate block */
					break;
				}
				count2++;
			}
		}
		buf_pool_mutex_exit_allow();
		mysql_mutex_unlock(&mutex);

		buf_resize_status(
			"Withdrawing blocks. (" ULINTPF "/" ULINTPF ").",
			UT_LIST_GET_LEN(withdraw),
			withdraw_target);

		ib::info() << "Withdrew "
			<< count1 << " blocks from free list."
			<< " Tried to relocate " << count2 << " blocks ("
			<< UT_LIST_GET_LEN(withdraw) << "/"
			<< withdraw_target << ").";

		if (++loop_count >= 10) {
			/* give up for now.
			retried after user threads paused. */

			ib::info() << "will retry to withdraw later";

			/* need retry later */
			return(true);
		}
	}

	/* confirm withdrawn enough */
	for (const chunk_t* chunk = chunks + n_chunks_new,
	     * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
		block = chunk->blocks;
		for (ulint j = chunk->size; j--; block++) {
			/* All blocks in the to-be-freed chunks must now be
			unused and on the withdraw list */
			ut_a(block->page.state() == buf_page_t::NOT_USED);
			ut_ad(block->in_withdraw_list);
		}
	}

	ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw)
		<< " blocks.";

	return(false);
}


/** Acquire all page_hash latches in descending address order
(one latch per ELEMENTS_PER_LATCH+1 array slots). */
inline void buf_pool_t::page_hash_table::write_lock_all()
{
  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
  {
    reinterpret_cast<page_hash_latch&>(array[n]).lock();
    if (!n)
      break;
  }
}


/** Release all page_hash latches, in the same order as write_lock_all(). */
inline void buf_pool_t::page_hash_table::write_unlock_all()
{
  for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
  {
    reinterpret_cast<page_hash_latch&>(array[n]).unlock();
    if (!n)
      break;
  }
}


namespace
{

/** Functor that reports transactions which were started before the
buffer pool withdrawal began and thus may be pinning blocks that
resize() is waiting to withdraw. */
struct find_interesting_trx
{
  void operator()(const trx_t &trx)
  {
    if (trx.state == TRX_STATE_NOT_STARTED)
      return;
    if (trx.mysql_thd == nullptr)
      return;
    /* Transactions started after the withdrawal began are not blocking it */
    if (withdraw_started <= trx.start_time_micro)
      return;

    if (!found)
    {
      /* Emit the explanatory header only once */
      ib::warn() << "The following trx might hold "
                    "the blocks in buffer pool to "
                    "be withdrawn. Buffer pool "
                    "resizing can complete only "
                    "after all the transactions "
                    "below release the blocks.";
      found= true;
    }

    lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
  }

  bool &found;
  /** microsecond_interval_timer() */
  const ulonglong withdraw_started;
  const my_hrtime_t current_time;
};

} // namespace

/** Resize from srv_buf_pool_old_size to srv_buf_pool_size.
*/
inline void buf_pool_t::resize()
{
	ut_ad(this == &buf_pool);

	bool	warning = false;

	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;

	ut_ad(!resize_in_progress());
	ut_ad(srv_buf_pool_chunk_unit > 0);

	/* new size in pages */
	ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
	std::ostringstream str_old_size, str_new_size, str_chunk_size;
	str_old_size << ib::bytes_iec{srv_buf_pool_old_size};
	str_new_size << ib::bytes_iec{srv_buf_pool_size};
	str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit};

	buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).",
			  str_old_size.str().c_str(),
			  str_new_size.str().c_str(),
			  str_chunk_size.str().c_str());

#ifdef BTR_CUR_HASH_ADAPT
	/* disable AHI if needed */
	buf_resize_status("Disabling adaptive hash index.");

	/* Remember whether AHI was on, so it can be re-enabled at the end */
	btr_search_s_lock_all();
	const bool btr_search_disabled = btr_search_enabled;
	btr_search_s_unlock_all();

	btr_search_disable();

	if (btr_search_disabled) {
		ib::info() << "disabled adaptive hash index.";
	}
#endif /* BTR_CUR_HASH_ADAPT */

	mysql_mutex_lock(&mutex);
	ut_ad(n_chunks_new == n_chunks);
	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);

	n_chunks_new = (new_instance_size << srv_page_size_shift)
		/ srv_buf_pool_chunk_unit;
	curr_size = n_chunks_new * chunks->size;
	mysql_mutex_unlock(&mutex);

	if (is_shrinking()) {
		/* set withdraw target: every block in the chunks that
		will be deleted must be withdrawn first */
		size_t w = 0;

		for (const chunk_t* chunk = chunks + n_chunks_new,
		     * const echunk = chunks + n_chunks;
		     chunk != echunk; chunk++)
			w += chunk->size;

		ut_ad(withdraw_target == 0);
		withdraw_target = w;
	}

	buf_resize_status("Withdrawing blocks to be shrunken.");

	ulonglong	withdraw_started = microsecond_interval_timer();
	ulonglong	message_interval = 60ULL * 1000 * 1000;
	ulint		retry_interval = 1;

withdraw_retry:
	/* wait for the number of blocks fit to the new size (if needed)*/
	bool	should_retry_withdraw = is_shrinking()
		&& withdraw_blocks();

	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
		/* abort to resize for shutdown. */
		return;
	}

	/* abort buffer pool load */
	buf_load_abort();

	const ulonglong current_time = microsecond_interval_timer();

	if (should_retry_withdraw
	    && current_time - withdraw_started >= message_interval) {

		/* Back off the warning frequency, capping at 30 minutes */
		if (message_interval > 900000000) {
			message_interval = 1800000000;
		} else {
			message_interval *= 2;
		}

		/* Report transactions that may be pinning the blocks */
		bool found= false;
		find_interesting_trx f
			{found, withdraw_started, my_hrtime_coarse()};
		withdraw_started = current_time;

		/* This is going to exceed the maximum size of a
		memory transaction. */
		LockMutexGuard g{SRW_LOCK_CALL};
		trx_sys.trx_list.for_each(f);
	}

	if (should_retry_withdraw) {
		ib::info() << "Will retry to withdraw " << retry_interval
			<< " seconds later.";
		std::this_thread::sleep_for(
			std::chrono::seconds(retry_interval));

		/* Exponential backoff of the retry interval, capped at 10s */
		if (retry_interval > 5) {
			retry_interval = 10;
		} else {
			retry_interval *= 2;
		}

		goto withdraw_retry;
	}

	buf_resize_status("Latching entire buffer pool.");

#ifndef DBUG_OFF
	{
		/* Debug sync point for tests to pause the resize here */
		bool	should_wait = true;

		while (should_wait) {
			should_wait = false;
			DBUG_EXECUTE_IF(
				"ib_buf_pool_resize_wait_before_resize",
				should_wait = true;
				std::this_thread::sleep_for(
					std::chrono::milliseconds(10)););
		}
	}
#endif /* !DBUG_OFF */

	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
		return;
	}

	/* Indicate critical path */
	resizing.store(true, std::memory_order_relaxed);

	mysql_mutex_lock(&mutex);
	page_hash.write_lock_all();

	/* A fresh chunk map; the old one stays valid for readers until
	map_ref is swapped below */
	chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());

	/* add/delete chunks */

	buf_resize_status("Resizing buffer pool from "
			  ULINTPF " chunks to " ULINTPF " chunks.",
			  n_chunks, n_chunks_new);

	if (is_shrinking()) {
		/* delete chunks */
		chunk_t* chunk = chunks + n_chunks_new;
		const chunk_t* const echunk = chunks + n_chunks;

		ulint	sum_freed = 0;

		while (chunk < echunk) {
			/* buf_LRU_block_free_non_file_page() invokes
			MEM_NOACCESS() on any buf_pool.free blocks.
			We must cancel the effect of that. In
			MemorySanitizer, MEM_NOACCESS() is no-op, so
			we must not do anything special for it here. */
#ifdef HAVE_valgrind
# if !__has_feature(memory_sanitizer)
			MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
# endif
#else
			MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
#endif

			buf_block_t*	block = chunk->blocks;

			for (ulint j = chunk->size; j--; block++) {
				block->page.lock.free();
			}

			allocator.deallocate_large_dodump(
				chunk->mem, &chunk->mem_pfx);
			sum_freed += chunk->size;
			++chunk;
		}

		/* discard withdraw list */
		UT_LIST_INIT(withdraw, &buf_page_t::list);
		withdraw_target = 0;

		ib::info() << n_chunks - n_chunks_new
			   << " Chunks (" << sum_freed
			   << " blocks) were freed.";

		n_chunks = n_chunks_new;
	}

	{
		/* reallocate chunks */
		const size_t	new_chunks_size
			= n_chunks_new * sizeof(chunk_t);

		chunk_t*	new_chunks = static_cast<chunk_t*>(
			ut_zalloc_nokey_nofatal(new_chunks_size));

		DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
				ut_free(new_chunks); new_chunks= nullptr; );

		if (!new_chunks) {
			/* Allocation failure: keep the current size and
			report failure at the end */
			ib::error() << "failed to allocate"
				" the chunk array.";
			n_chunks_new = n_chunks;
			warning = true;
			chunks_old = NULL;
			goto calc_buf_pool_size;
		}

		ulint	n_chunks_copy = ut_min(n_chunks_new, n_chunks);

		memcpy(new_chunks, chunks,
		       n_chunks_copy * sizeof *new_chunks);

		/* Re-register the surviving chunks in the new chunk map */
		for (ulint j = 0; j < n_chunks_copy; j++) {
			new_chunks[j].reg();
		}

		chunks_old = chunks;
		chunks = new_chunks;
	}

	if (n_chunks_new > n_chunks) {
		/* add chunks */
		ulint	sum_added = 0;
		ulint	n = n_chunks;
		const size_t unit = srv_buf_pool_chunk_unit;

		for (chunk_t* chunk = chunks + n_chunks,
		     * const echunk = chunks + n_chunks_new;
		     chunk != echunk; chunk++) {
			if (!chunk->create(unit)) {
				/* Partial success: keep the chunks that
				could be created and report a warning */
				ib::error() << "failed to allocate"
					" memory for buffer pool chunk";

				warning = true;
				n_chunks_new = n_chunks;
				break;
			}

			sum_added += chunk->size;
			++n;
		}

		ib::info() << n_chunks_new - n_chunks
			   << " chunks (" << sum_added
			   << " blocks) were added.";

		n_chunks = n;
	}
calc_buf_pool_size:
	/* recalc curr_size */
	ulint	new_size = 0;

	{
		chunk_t* chunk = chunks;
		const chunk_t* const echunk = chunk + n_chunks;
		do {
			new_size += chunk->size;
		} while (++chunk != echunk);
	}

	curr_size = new_size;
	n_chunks_new = n_chunks;

	if (chunks_old) {
		ut_free(chunks_old);
		chunks_old = NULL;
	}

	/* Publish the new chunk map and defer deleting the old one until
	after the latches are released */
	chunk_t::map* chunk_map_old = chunk_t::map_ref;
	chunk_t::map_ref = chunk_t::map_reg;

	/* set size */
	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
	ulint s= curr_size;
	s/= BUF_READ_AHEAD_PORTION;
	read_ahead_area= s >= READ_AHEAD_PAGES
		? READ_AHEAD_PAGES
		: my_round_up_to_next_power(static_cast<uint32_t>(s));
	curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
	srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
	extern ulonglong innobase_buffer_pool_size;
	innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size);

	/* More than 2x change warrants resizing dependent structures below */
	const bool	new_size_too_diff
		= srv_buf_pool_base_size > srv_buf_pool_size * 2
			|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;

	mysql_mutex_unlock(&mutex);
	page_hash.write_unlock_all();

	UT_DELETE(chunk_map_old);

	resizing.store(false, std::memory_order_relaxed);

	/* Normalize other components, if the new size is too different */
	if (!warning && new_size_too_diff) {
		srv_buf_pool_base_size = srv_buf_pool_size;

		buf_resize_status("Resizing other hash tables.");

		srv_lock_table_size = 5
			* (srv_buf_pool_size >> srv_page_size_shift);
		lock_sys.resize(srv_lock_table_size);
		dict_sys.resize();

		ib::info() << "Resized hash tables: lock_sys,"
#ifdef BTR_CUR_HASH_ADAPT
			" adaptive hash index,"
#endif /* BTR_CUR_HASH_ADAPT */
			" and dictionary.";
	}

	/* normalize ibuf.max_size */
	ibuf_max_size_update(srv_change_buffer_max_size);

	if (srv_buf_pool_old_size != srv_buf_pool_size) {

		buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes."
				  ,srv_buf_pool_old_size, srv_buf_pool_size);
		srv_buf_pool_old_size = srv_buf_pool_size;
	}

#ifdef BTR_CUR_HASH_ADAPT
	/* enable AHI if needed */
	if (btr_search_disabled) {
		btr_search_enable(true);
		ib::info() << "Re-enabled adaptive hash index.";
	}
#endif /* BTR_CUR_HASH_ADAPT */

	if (warning)
		buf_resize_status("Resizing buffer pool failed");

	ut_d(validate());

	return;
}

/** Thread pool task invoked by innodb_buffer_pool_size changes. */
static void buf_resize_callback(void *)
{
	DBUG_ENTER("buf_resize_callback");
	ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
	mysql_mutex_lock(&buf_pool.mutex);
	const auto size= srv_buf_pool_size;
	const bool work= srv_buf_pool_old_size != size;
	mysql_mutex_unlock(&buf_pool.mutex);

	if (work)
		buf_pool.resize();
	else
	{
		std::ostringstream sout;
		sout << "Size did not change: old size = new size = " << size;
		buf_resize_status(sout.str().c_str());
	}
	DBUG_VOID_RETURN;
}

/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */
static tpool::task_group single_threaded_group(1);
static tpool::waitable_task buf_resize_task(buf_resize_callback,
	nullptr, &single_threaded_group);

void buf_resize_start()
{
	srv_thread_pool->submit_task(&buf_resize_task);
}

void buf_resize_shutdown()
{
	/* Wait for any in-flight resize to finish before shutdown proceeds */
	buf_resize_task.wait();
}


/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
buf_pool.page_hash.
The caller must relocate bpage->list.
@param bpage ROW_FORMAT=COMPRESSED only block
@param dpage destination control block */
static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
{
  const page_id_t id{bpage->id()};
  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
  ut_ad(!bpage->frame);
  mysql_mutex_assert_owner(&buf_pool.mutex);
  ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
  ut_ad(bpage == buf_pool.page_hash.get(id, chain));
  ut_ad(!buf_pool.watch_is_sentinel(*bpage));
  ut_d(const auto state= bpage->state());
  ut_ad(state >= buf_page_t::FREED);
  ut_ad(state <= buf_page_t::READ_FIX);
  ut_ad(bpage->lock.is_write_locked());
  /* Preserve dpage's own frame pointer across the placement-new copy */
  const auto frame= dpage->frame;

  dpage->lock.free();
  new (dpage) buf_page_t(*bpage);

  dpage->frame= frame;

  /* Important that we adjust the hazard pointer before
  removing bpage from LRU list. */
  if (buf_page_t *b= buf_pool.LRU_remove(bpage))
    UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
  else
    UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);

  if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
  {
    buf_pool.LRU_old= dpage;
#ifdef UNIV_LRU_DEBUG
    /* buf_pool.LRU_old must be the first item in the LRU list
    whose "old" flag is set. */
    ut_a(buf_pool.LRU_old->old);
    ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
         !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
    ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
         UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
  }
  else
  {
    /* Check that the "old" flag is consistent in
    the block and its neighbours. */
    dpage->set_old(dpage->is_old());
#endif /* UNIV_LRU_DEBUG */
  }

  ut_d(CheckInLRUList::validate());

  buf_pool.page_hash.replace(chain, bpage, dpage);
}

/** Register a watch for a page identifier, so that a future read of the
page can be detected, or buffer-fix the page if it already exists.
@param id page identifier
@param chain page_hash chain for id
@return the buffer-fixed existing page, or nullptr if a watch was set */
buf_page_t *buf_pool_t::watch_set(const page_id_t id,
                                  buf_pool_t::hash_chain &chain)
{
  ut_ad(&chain == &page_hash.cell_get(id.fold()));
  page_hash.lock_get(chain).lock();

  buf_page_t *bpage= page_hash.get(id, chain);

  if (bpage)
  {
got_block:
    /* Buffer-fix before deciding; a sentinel (another thread's watch)
    yields nullptr but stays fixed */
    bpage->fix();
    if (watch_is_sentinel(*bpage))
      bpage= nullptr;
    page_hash.lock_get(chain).unlock();
    return bpage;
  }

  page_hash.lock_get(chain).unlock();
  /* Allocate a watch[] and then try to insert it into the page_hash. */
  mysql_mutex_lock(&mutex);

  /* The maximum number of purge tasks should never exceed
  the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
  watch when setting another watch. */
  /* NOTE(review): if every watch[] slot were in use, this `w-- >= watch`
  loop would evaluate watch[-1].state() before reaching ut_error;
  the invariant above is what prevents that — confirm upstream. */
  for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; )
  {
    ut_ad(w->access_time == 0);
    ut_ad(!w->oldest_modification());
    ut_ad(!w->zip.data);
    ut_ad(!w->in_zip_hash);
    static_assert(buf_page_t::NOT_USED == 0, "efficiency");
    if (ut_d(auto s=) w->state())
    {
      /* This watch may be in use for some other page. */
      ut_ad(s >= buf_page_t::UNFIXED);
      continue;
    }
    /* w is pointing to watch[], which is protected by mutex.
    Normally, buf_page_t::id for objects that are reachable by
    page_hash.get(id, chain) are protected by hash_lock. */
    w->set_state(buf_page_t::UNFIXED + 1);
    w->id_= id;

    page_hash.lock_get(chain).lock();
    bpage= page_hash.get(id, chain);
    if (UNIV_LIKELY_NULL(bpage))
    {
      /* The page appeared while the latch was released: release the
      watch slot and fall back to buffer-fixing the page */
      w->set_state(buf_page_t::NOT_USED);
      mysql_mutex_unlock(&mutex);
      goto got_block;
    }

    ut_ad(w->state() == buf_page_t::UNFIXED + 1);
    buf_pool.page_hash.append(chain, w);
    mysql_mutex_unlock(&mutex);
    page_hash.lock_get(chain).unlock();
    return nullptr;
  }

  ut_error;
}

/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
@param id page identifier
@param chain unlocked hash table chain */
TRANSACTIONAL_TARGET
void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
{
  mysql_mutex_assert_not_owner(&mutex);
  buf_page_t *w;
  {
    transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
    /* The page must exist because watch_set() did fix(). */
    w= page_hash.get(id, chain);
    ut_ad(w->in_page_hash);
    if (!watch_is_sentinel(*w))
    {
      /* The watched page was read in; simply release our fix */
no_watch:
      w->unfix();
      w= nullptr;
    }
    else
    {
      const auto state= w->state();
      ut_ad(~buf_page_t::LRU_MASK & state);
      ut_ad(state >= buf_page_t::UNFIXED + 1);
      /* Another thread still holds a fix on the sentinel; it will be
      responsible for releasing the watch slot */
      if (state != buf_page_t::UNFIXED + 1)
        goto no_watch;
    }
  }

  if (!w)
    return;

  const auto old= w;
  /* The following is based on buf_pool_t::watch_remove(). */
  mysql_mutex_lock(&mutex);
  /* Re-lookup: the sentinel may have been replaced by a real page in
  the meantime */
  w= page_hash.get(id, chain);

  {
    transactional_lock_guard<page_hash_latch> g
      {buf_pool.page_hash.lock_get(chain)};
    auto f= w->unfix();
    ut_ad(f < buf_page_t::READ_FIX || w != old);

    if (f == buf_page_t::UNFIXED && w == old)
    {
      page_hash.remove(chain, w);
      // Now that w is detached from page_hash, release it to watch[].
      ut_ad(w->id_ == id);
      ut_ad(!w->frame);
      ut_ad(!w->zip.data);
      w->set_state(buf_page_t::NOT_USED);
    }
  }

  mysql_mutex_unlock(&mutex);
}

/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out]	space	tablespace
@param[in]	page	page number
@param[in,out]	mtr	mini-transaction */
TRANSACTIONAL_TARGET
void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
{
  ut_ad(mtr);
  ut_ad(mtr->is_active());

  /* Record the freed page so that scrubbing or hole punching can be
  performed when the mini-transaction is committed */
  if (srv_immediate_scrub_data_uncompressed
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
      || space->is_compressed()
#endif
      )
    mtr->add_freed_offset(space, page);

  ++buf_pool.stat.n_page_gets;
  const page_id_t page_id(space->id, page);
  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
  uint32_t fix;
  buf_block_t *block;
  {
    transactional_shared_lock_guard<page_hash_latch> g
      {buf_pool.page_hash.lock_get(chain)};
    block= reinterpret_cast<buf_block_t*>
      (buf_pool.page_hash.get(page_id, chain));
    if (!block || !block->page.frame)
      /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
      return;
    /* To avoid a deadlock with buf_LRU_free_page() of some other page
    and buf_page_write_complete() of this page, we must not wait for a
    page latch while holding a page_hash latch. */
    fix= block->page.fix();
  }

  if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED))
  {
    /* The block was freed or removed concurrently; undo our fix */
    block->page.unfix();
    return;
  }

  block->page.lock.x_lock();
  if (block->page.is_ibuf_exist())
    ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size());
#ifdef BTR_CUR_HASH_ADAPT
  if (block->index)
    btr_search_drop_page_hash_index(block, false);
#endif /* BTR_CUR_HASH_ADAPT */
  block->page.set_freed(block->page.state());
  /* The fix and the X-latch are released at mtr commit */
  mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
}

/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
The page must be released with unfix().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
@param page_id page identifier
@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
@return pointer to the block, s-latched */
TRANSACTIONAL_TARGET
buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
{
  ut_ad(zip_size);
  ut_ad(ut_is_2pow(zip_size));
  ++buf_pool.stat.n_page_gets;
  mariadb_increment_pages_accessed();

  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
  page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
  buf_page_t *bpage;

lookup:
  /* On the second pass (discard_attempted) we accept waiting for the
  page S-latch; on the first pass we try to evict any uncompressed copy
  so that only the ROW_FORMAT=COMPRESSED frame remains */
  for (bool discard_attempted= false;;)
  {
#ifndef NO_ELISION
    /* Memory-elision fast path; mirrors the locked path below */
    if (xbegin())
    {
      if (hash_lock.is_locked())
        xabort();
      bpage= buf_pool.page_hash.get(page_id, chain);
      if (!bpage || buf_pool.watch_is_sentinel(*bpage))
      {
        xend();
        goto must_read_page;
      }
      if (!bpage->zip.data)
      {
        /* There is no ROW_FORMAT=COMPRESSED page. */
        xend();
        return nullptr;
      }
      if (discard_attempted || !bpage->frame)
      {
        if (!bpage->lock.s_lock_try())
          xabort();
        xend();
        break;
      }
      xend();
    }
    else
#endif
    {
      hash_lock.lock_shared();
      bpage= buf_pool.page_hash.get(page_id, chain);
      if (!bpage || buf_pool.watch_is_sentinel(*bpage))
      {
        hash_lock.unlock_shared();
        goto must_read_page;
      }

      ut_ad(bpage->in_file());
      ut_ad(page_id == bpage->id());

      if (!bpage->zip.data)
      {
        /* There is no ROW_FORMAT=COMPRESSED page. */
        hash_lock.unlock_shared();
        return nullptr;
      }

      if (discard_attempted || !bpage->frame)
      {
        /* Even when we are holding a hash_lock, it should be
        acceptable to wait for a page S-latch here, because
        buf_page_t::read_complete() will not wait for buf_pool.mutex,
        and because S-latch would not conflict with a U-latch
        that would be protecting buf_page_t::write_complete(). */
        bpage->lock.s_lock();
        hash_lock.unlock_shared();
        break;
      }

      hash_lock.unlock_shared();
    }

    /* Try to evict the uncompressed copy of the page and retry */
    discard_attempted= true;
    mysql_mutex_lock(&buf_pool.mutex);
    if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
      buf_LRU_free_page(bpage, false);
    mysql_mutex_unlock(&buf_pool.mutex);
  }

  {
    ut_d(const auto s=) bpage->fix();
    ut_ad(s >= buf_page_t::UNFIXED);
    ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
  }

  bpage->set_accessed();
  buf_page_make_young_if_needed(bpage);

#ifdef UNIV_DEBUG
  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
  return bpage;

must_read_page:
  switch (dberr_t err= buf_read_page(page_id, zip_size)) {
  case DB_SUCCESS:
  case DB_SUCCESS_LOCKED_REC:
    mariadb_increment_pages_read();
    goto lookup;
  default:
    ib::error() << "Reading compressed page " << page_id
                << " failed with error: " << err;
    return nullptr;
  }
}

/********************************************************************//**
Initialize some fields of a control block. */
UNIV_INLINE
void
buf_block_init_low(
/*===============*/
	buf_block_t*	block)	/*!< in: block to init */
{
#ifdef BTR_CUR_HASH_ADAPT
	/* No adaptive hash index entries may point to a previously
	unused (and now freshly allocated) block. */
	assert_block_ahi_empty_on_init(block);
	block->index		= NULL;

	block->n_hash_helps	= 0;
	block->n_fields		= 1;
	block->n_bytes		= 0;
	block->left_side	= TRUE;
#endif /* BTR_CUR_HASH_ADAPT */
}

/********************************************************************//**
Decompress a block.
@return TRUE if successful */
ibool
buf_zip_decompress(
/*===============*/
	buf_block_t*	block,	/*!< in/out: block */
	ibool		check)	/*!< in: TRUE=verify the page checksum */
{
	const byte*	frame = block->page.zip.data;
	ulint		size = page_zip_get_size(&block->page.zip);
	/* The tablespace will not be found if this function is called
	during IMPORT. */
	fil_space_t* space= fil_space_t::get(block->page.id().space());
	const unsigned key_version = mach_read_from_4(
		frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
	fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
	/* Whether a decompression failure may be explained by the page
	still being encrypted */
	const bool encrypted = crypt_data
		&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
		&& (!crypt_data->is_default_encryption()
		    || srv_encrypt_tables);

	ut_ad(block->zip_size());
	ut_a(block->page.id().space() != 0);

	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {

		ib::error() << "Compressed page checksum mismatch for "
			<< (space ? space->chain.start->name : "")
			<< block->page.id() << ": stored: "
			<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			<< ", crc32: "
			<< page_zip_calc_checksum(frame, size, false)
			<< " adler32: "
			<< page_zip_calc_checksum(frame, size, true);
		goto err_exit;
	}

	switch (fil_page_get_type(frame)) {
	case FIL_PAGE_INDEX:
	case FIL_PAGE_RTREE:
		/* Only index pages need actual decompression */
		if (page_zip_decompress(&block->page.zip,
					block->page.frame, TRUE)) {
func_exit:
			if (space) {
				space->release();
			}
			return(TRUE);
		}

		ib::error() << "Unable to decompress "
			<< (space ? space->chain.start->name : "")
			<< block->page.id();
		goto err_exit;
	case FIL_PAGE_TYPE_ALLOCATED:
	case FIL_PAGE_INODE:
	case FIL_PAGE_IBUF_BITMAP:
	case FIL_PAGE_TYPE_FSP_HDR:
	case FIL_PAGE_TYPE_XDES:
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
		/* Copy to uncompressed storage. */
		memcpy(block->page.frame, frame, block->zip_size());
		goto func_exit;
	}

	ib::error() << "Unknown compressed page type "
		<< fil_page_get_type(frame)
		<< " in " << (space ? space->chain.start->name : "")
		<< block->page.id();

err_exit:
	if (encrypted) {
		ib::info() << "Row compressed page could be encrypted"
			" with key_version " << key_version;
	}

	if (space) {
		space->release();
	}

	return(FALSE);
}

/** Low level function used to get access to a database page.
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. +@return pointer to the block or NULL */ +TRANSACTIONAL_TARGET +buf_block_t* +buf_page_get_low( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge) +{ + unsigned access_time; + ulint retries = 0; + + ut_ad(!mtr || mtr->is_active()); + ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_SX_LATCH) + || (rw_latch == RW_NO_LATCH)); + + if (err) { + *err = DB_SUCCESS; + } + +#ifdef UNIV_DEBUG + switch (mode) { + default: + ut_ad(!allow_ibuf_merge); + ut_ad(mode == BUF_PEEK_IF_IN_POOL); + break; + case BUF_GET_POSSIBLY_FREED: + case BUF_GET_IF_IN_POOL: + /* The caller may pass a dummy page size, + because it does not really matter. 
*/ + break; + case BUF_GET: + case BUF_GET_IF_IN_POOL_OR_WATCH: + ut_ad(!mtr->is_freeing_tree()); + fil_space_t* s = fil_space_get(page_id.space()); + ut_ad(s); + ut_ad(s->zip_size() == zip_size); + } +#endif /* UNIV_DEBUG */ + + ut_ad(!mtr || !ibuf_inside(mtr) + || ibuf_page_low(page_id, zip_size, FALSE, NULL)); + + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + + auto& chain= buf_pool.page_hash.cell_get(page_id.fold()); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); +loop: + buf_block_t* block = guess; + uint32_t state; + + if (block) { + transactional_shared_lock_guard<page_hash_latch> g{hash_lock}; + if (buf_pool.is_uncompressed(block) + && page_id == block->page.id()) { + ut_ad(!block->page.in_zip_hash); + state = block->page.state(); + /* Ignore guesses that point to read-fixed blocks. + We can only avoid a race condition by + looking up the block via buf_pool.page_hash. */ + if ((state >= buf_page_t::FREED + && state < buf_page_t::READ_FIX) + || state >= buf_page_t::WRITE_FIX) { + state = block->page.fix(); + goto got_block; + } + } + } + + guess = nullptr; + + /* A memory transaction would frequently be aborted here. 
*/ + hash_lock.lock_shared(); + block = reinterpret_cast<buf_block_t*>( + buf_pool.page_hash.get(page_id, chain)); + if (UNIV_LIKELY(block + && !buf_pool.watch_is_sentinel(block->page))) { + state = block->page.fix(); + hash_lock.unlock_shared(); + goto got_block; + } + hash_lock.unlock_shared(); + + /* Page not in buf_pool: needs to be read from file */ + switch (mode) { + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + return nullptr; + case BUF_GET_IF_IN_POOL_OR_WATCH: + /* Buffer-fixing inside watch_set() will prevent eviction */ + block = reinterpret_cast<buf_block_t*> + (buf_pool.watch_set(page_id, chain)); + + if (block) { + state = block->page.state(); + goto got_block_fixed; + } + + return nullptr; + } + + /* The call path is buf_read_page() -> + buf_read_page_low() (fil_space_t::io()) -> + buf_page_t::read_complete() -> + buf_decrypt_after_read(). Here fil_space_t* is used + and we decrypt -> buf_page_check_corrupt() where page + checksums are compared. Decryption, decompression as + well as error handling takes place at a lower level. + Here we only need to know whether the page really is + corrupted, or if an encrypted page with a valid + checksum cannot be decypted. 
*/ + + switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED + && retries++ < BUF_PAGE_READ_MAX_RETRIES) { + DBUG_EXECUTE_IF("intermittent_read_failure", + retries = BUF_PAGE_READ_MAX_RETRIES;); + } + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; + } + + ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + goto loop; + +got_block: + ut_ad(!block->page.in_zip_hash); + state++; +got_block_fixed: + ut_ad(state > buf_page_t::FREED); + + if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) { + if (mode == BUF_PEEK_IF_IN_POOL) { +ignore_block: + ut_ad(mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL); + block->unfix(); + if (err) { + *err = DB_CORRUPTION; + } + return nullptr; + } + + if (UNIV_UNLIKELY(!block->page.frame)) { + goto wait_for_unzip; + } + /* A read-fix is released after block->page.lock + in buf_page_t::read_complete() or + buf_pool_t::corrupted_evict(), or + after buf_zip_decompress() in this function. 
*/ + block->page.lock.s_lock(); + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX + || state >= buf_page_t::WRITE_FIX); + const page_id_t id{block->page.id()}; + block->page.lock.s_unlock(); + + if (UNIV_UNLIKELY(id != page_id)) { + ut_ad(id == page_id_t{~0ULL}); + block->page.unfix(); + if (++retries < BUF_PAGE_READ_MAX_RETRIES) { + goto loop; + } + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + + return nullptr; + } + } else if (mode != BUF_PEEK_IF_IN_POOL) { + } else if (!mtr) { + ut_ad(!block->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + block->unfix(); + +free_unfixed_block: + if (!buf_LRU_free_page(&block->page, true)) { + ut_ad(0); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return nullptr; + } else if (UNIV_UNLIKELY(!block->page.frame)) { + /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only page. */ + goto ignore_block; + } + + ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL + || block->zip_size() == zip_size); + + if (UNIV_UNLIKELY(!block->page.frame)) { + if (!block->page.lock.x_lock_try()) { +wait_for_unzip: + /* The page is being read or written, or + another thread is executing buf_zip_decompress() + in buf_page_get_low() on it. */ + block->page.unfix(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto loop; + } + + buf_block_t *new_block = buf_LRU_get_free_block(false); + buf_block_init_low(new_block); + +wait_for_unfix: + mysql_mutex_lock(&buf_pool.mutex); + page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain); + + /* It does not make sense to use + transactional_lock_guard here, because buf_relocate() + would likely make a memory transaction too large. 
*/ + hash_lock.lock(); + + /* block->page.lock implies !block->page.can_relocate() */ + ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain)); + + /* Wait for any other threads to release their buffer-fix + on the compressed-only block descriptor. + FIXME: Never fix() before acquiring the lock. + Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free() + we are violating that principle. */ + state = block->page.state(); + + switch (state) { + case buf_page_t::UNFIXED + 1: + case buf_page_t::IBUF_EXIST + 1: + case buf_page_t::REINIT + 1: + break; + default: + ut_ad(state < buf_page_t::READ_FIX); + + if (state < buf_page_t::UNFIXED + 1) { + ut_ad(state > buf_page_t::FREED); + block->page.lock.x_unlock(); + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(new_block); + mysql_mutex_unlock(&buf_pool.mutex); + goto ignore_block; + } + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto wait_for_unfix; + } + + /* Ensure that another buf_page_get_low() will wait for + new_block->page.lock.x_unlock(). */ + block->page.set_state(buf_page_t::READ_FIX); + + /* Move the compressed page from block->page to new_block, + and uncompress it. */ + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(&block->page, &new_block->page); + + /* X-latch the block for the duration of the decompression. 
*/ + new_block->page.lock.x_lock(); + ut_d(block->page.lock.x_unlock()); + + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(new_block, FALSE); + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); + +#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG + block->page.lock.free(); +#endif + ut_free(reinterpret_cast<buf_page_t*>(block)); + block = new_block; + + buf_pool.n_pend_unzip++; + + access_time = block->page.is_accessed(); + + if (!access_time && !recv_no_ibuf_operations + && ibuf_page_exists(block->page.id(), block->zip_size())) { + state = buf_page_t::IBUF_EXIST + 1; + } + + /* Decompress the page while not holding + buf_pool.mutex. */ + const auto ok = buf_zip_decompress(block, false); + --buf_pool.n_pend_unzip; + if (!ok) { + if (err) { + *err = DB_PAGE_CORRUPTED; + } + mysql_mutex_lock(&buf_pool.mutex); + } + state = block->page.read_unfix(state); + block->page.lock.x_unlock(); + + if (!ok) { + goto free_unfixed_block; + } + } + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +re_evict: + if (mode != BUF_GET_IF_IN_POOL + && mode != BUF_GET_IF_IN_POOL_OR_WATCH) { + } else if (!ibuf_debug || recv_recovery_is_on()) { + } else if (fil_space_t* space = fil_space_t::get(page_id.space())) { + for (ulint i = 0; i < mtr->get_savepoint(); i++) { + if (buf_block_t* b = mtr->block_at_savepoint(i)) { + if (b->page.oldest_modification() > 2 + && b->page.lock.have_any()) { + /* We are holding a dirty page latch + that would hang buf_flush_sync(). */ + space->release(); + goto re_evict_fail; + } + } + } + + /* Try to evict the block from the buffer pool, to use the + insert buffer (change buffer) as much as possible. */ + + mysql_mutex_lock(&buf_pool.mutex); + + block->unfix(); + + /* Blocks cannot be relocated or enter or exit the + buf_pool while we are holding the buf_pool.mutex. 
*/ + const bool evicted = buf_LRU_free_page(&block->page, true); + space->release(); + + if (!evicted) { + block->fix(); + } + + mysql_mutex_unlock(&buf_pool.mutex); + + if (evicted) { + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + buf_pool.watch_set(page_id, chain); + } + return(NULL); + } + + buf_flush_sync(); + + state = block->page.state(); + + if (state == buf_page_t::UNFIXED + 1 + && !block->page.oldest_modification()) { + goto re_evict; + } + + /* Failed to evict the page; change it directly */ + } +re_evict_fail: +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + goto ignore_block; + } + ut_ad((~buf_page_t::LRU_MASK) & state); + ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.frame); + + if (state >= buf_page_t::UNFIXED + && allow_ibuf_merge + && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX + && page_is_leaf(block->page.frame)) { + block->page.lock.x_lock(); + ut_ad(block->page.id() == page_id + || (state >= buf_page_t::READ_FIX + && state < buf_page_t::WRITE_FIX)); + +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + + dberr_t e; + + if (UNIV_UNLIKELY(block->page.id() != page_id)) { +page_id_mismatch: + state = block->page.state(); + e = DB_CORRUPTION; +ibuf_merge_corrupted: + if (err) { + *err = e; + } + + if (block->page.id().is_corrupted()) { + buf_pool.corrupted_evict(&block->page, state); + } + return nullptr; + } + + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::IBUF_EXIST + && state < buf_page_t::REINIT) { + block->page.clear_ibuf_exist(); + e = ibuf_merge_or_delete_for_page(block, page_id, + block->zip_size()); + if (UNIV_UNLIKELY(e != DB_SUCCESS)) { + goto ibuf_merge_corrupted; + } + } + + if (rw_latch == RW_X_LATCH) { + goto 
get_latch_valid; + } else { + block->page.lock.x_unlock(); + goto get_latch; + } + } else { +get_latch: + switch (rw_latch) { + case RW_NO_LATCH: + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + ut_ad(!block->page.is_read_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.s_unlock(); + block->page.lock.x_lock(); + goto page_id_mismatch; + } +get_latch_valid: + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.u_x_upgrade(); + goto page_id_mismatch; + } + goto get_latch_valid; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); + return block; + } + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + goto page_id_mismatch; + } + goto get_latch_valid; + } + + ut_ad(page_id_t(page_get_space_id(block->page.frame), + page_get_page_no(block->page.frame)) + == page_id); + + if (mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL) { + return block; + } + + const bool not_first_access{block->page.set_accessed()}; + buf_page_make_young_if_needed(&block->page); + if (!not_first_access) { + buf_read_ahead_linear(page_id, block->zip_size(), + ibuf_inside(mtr)); + } + } + + return block; +} + +/** Get access to a database page. Buffered redo log may be applied. 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction, or NULL +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the pages from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge) +{ + buf_block_t *block= recv_sys.recover(page_id); + if (UNIV_LIKELY(!block)) + return buf_page_get_low(page_id, zip_size, rw_latch, + guess, mode, mtr, err, allow_ibuf_merge); + else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1))) + { + corrupted: + if (err) + *err= DB_CORRUPTION; + return nullptr; + } + /* Recovery is a special case; we fix() before acquiring lock. */ + auto s= block->page.fix(); + ut_ad(s >= buf_page_t::FREED); + /* The block may be write-fixed at this point because we are not + holding a lock, but it must not be read-fixed. 
*/ + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + if (err) + *err= DB_SUCCESS; + const bool must_merge= allow_ibuf_merge && + ibuf_page_exists(page_id, block->zip_size()); + if (s < buf_page_t::UNFIXED) + { + got_freed_page: + ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); + mysql_mutex_lock(&buf_pool.mutex); + block->page.unfix(); + buf_LRU_free_page(&block->page, true); + mysql_mutex_unlock(&buf_pool.mutex); + goto corrupted; + } + else if (must_merge && + fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX && + page_is_leaf(block->page.frame)) + { + block->page.lock.x_lock(); + s= block->page.state(); + ut_ad(s > buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + if (s < buf_page_t::UNFIXED) + { + block->page.lock.x_unlock(); + goto got_freed_page; + } + else + { + if (block->page.is_ibuf_exist()) + block->page.clear_ibuf_exist(); + if (dberr_t e= + ibuf_merge_or_delete_for_page(block, page_id, block->zip_size())) + { + if (err) + *err= e; + buf_pool.corrupted_evict(&block->page, s); + return nullptr; + } + } + + if (rw_latch == RW_X_LATCH) + { + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + return block; + } + block->page.lock.x_unlock(); + } + mtr->page_lock(block, rw_latch); + return block; +} + +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. 
+@return TRUE if success */ +TRANSACTIONAL_TARGET +bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, + uint64_t modify_clock, mtr_t *mtr) +{ + ut_ad(block); + ut_ad(mtr); + ut_ad(mtr->is_active()); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (have_transactional_memory); + else if (UNIV_UNLIKELY(!block->page.frame)) + return false; + else + { + const auto state= block->page.state(); + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || + state >= buf_page_t::READ_FIX)) + return false; + } + + bool success; + const page_id_t id{block->page.id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + bool have_u_not_x= false; + + { + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame)) + return false; + const auto state= block->page.state(); + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || + state >= buf_page_t::READ_FIX)) + return false; + + if (rw_latch == RW_S_LATCH) + success= block->page.lock.s_lock_try(); + else + { + have_u_not_x= block->page.lock.have_u_not_x(); + success= have_u_not_x || block->page.lock.x_lock_try(); + } + } + + if (!success) + return false; + + if (have_u_not_x) + { + block->page.lock.u_x_upgrade(); + mtr->page_lock_upgrade(*block); + ut_ad(id == block->page.id()); + ut_ad(modify_clock == block->modify_clock); + } + else + { + ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed()); + ut_ad(id == block->page.id()); + ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr)); + + if (modify_clock != block->modify_clock || block->page.is_freed()) + { + if (rw_latch == RW_S_LATCH) + block->page.lock.s_unlock(); + else + block->page.lock.x_unlock(); + return false; + } + + block->page.fix(); + ut_ad(!block->page.is_read_fixed()); + block->page.set_accessed(); + buf_page_make_young_if_needed(&block->page); + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + } + + 
ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + ut_d(const auto state = block->page.state()); + ut_ad(state > buf_page_t::UNFIXED); + ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX); + ut_ad(~buf_page_t::LRU_MASK & state); + ut_ad(block->page.frame); + + return true; +} + +/** Try to S-latch a page. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). +@param[in] page_id page identifier +@param[in,out] mtr mini-transaction +@return the block +@retval nullptr if an S-latch cannot be granted immediately */ +TRANSACTIONAL_TARGET +buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr) +{ + ut_ad(mtr); + ut_ad(mtr->is_active()); + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + buf_block_t *block; + + { + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash.get(page_id, chain)); + if (!block || !block->page.frame || !block->page.lock.s_lock_try()) + return nullptr; + } + + block->page.fix(); + ut_ad(!block->page.is_read_fixed()); + mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.id() == page_id); + + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + return block; +} + +/** Initialize the block. 
+@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param fix initial buf_fix_count() */ +void buf_block_t::initialise(const page_id_t page_id, ulint zip_size, + uint32_t fix) +{ + ut_ad(!page.in_file()); + buf_block_init_low(this); + page.init(fix, page_id); + page.set_os_used(); + page_zip_set_size(&page.zip, zip_size); +} + +TRANSACTIONAL_TARGET +static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size, + mtr_t *mtr, buf_block_t *free_block) +{ + ut_ad(mtr->is_active()); + ut_ad(page_id.space() != 0 || !zip_size); + + free_block->initialise(page_id, zip_size, buf_page_t::MEMORY); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); +retry: + mysql_mutex_lock(&buf_pool.mutex); + + buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + + if (bpage && !buf_pool.watch_is_sentinel(*bpage)) + { +#ifdef BTR_CUR_HASH_ADAPT + const dict_index_t *drop_hash_entry= nullptr; +#endif + bool ibuf_exist= false; + + if (!mtr->have_x_latch(reinterpret_cast<const buf_block_t&>(*bpage))) + { + const bool got= bpage->lock.x_lock_try(); + if (!got) + { + mysql_mutex_unlock(&buf_pool.mutex); + bpage->lock.x_lock(); + const page_id_t id{bpage->id()}; + if (UNIV_UNLIKELY(id != page_id)) + { + ut_ad(id.is_corrupted()); + bpage->lock.x_unlock(); + goto retry; + } + mysql_mutex_lock(&buf_pool.mutex); + } + + auto state= bpage->fix(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + if (state < buf_page_t::UNFIXED) + bpage->set_reinit(buf_page_t::FREED); + else + { + bpage->set_reinit(state & buf_page_t::LRU_MASK); + ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST; + } + + if (UNIV_LIKELY(bpage->frame != nullptr)) + { + mysql_mutex_unlock(&buf_pool.mutex); + buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage); + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + drop_hash_entry= block->index; +#endif + } + else + { + auto 
state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard here, + because buf_relocate() would likely make the memory transaction + too large. */ + hash_lock.lock(); + + if (state < buf_page_t::UNFIXED) + bpage->set_reinit(buf_page_t::FREED); + else + { + bpage->set_reinit(state & buf_page_t::LRU_MASK); + ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST; + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(bpage, &free_block->page); + free_block->page.lock.x_lock(); + buf_flush_relocate_on_flush_list(bpage, &free_block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_unzip_LRU_add_block(free_block, FALSE); + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); +#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG + bpage->lock.x_unlock(); + bpage->lock.free(); +#endif + ut_free(bpage); + mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX); + bpage= &free_block->page; + } + } + else + { + mysql_mutex_unlock(&buf_pool.mutex); + ut_ad(bpage->frame); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index); +#endif + const auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + bpage->set_reinit(state < buf_page_t::UNFIXED ? 
buf_page_t::FREED + : state & buf_page_t::LRU_MASK); + } + +#ifdef BTR_CUR_HASH_ADAPT + if (drop_hash_entry) + btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage), + false); +#endif /* BTR_CUR_HASH_ADAPT */ + + if (ibuf_exist && !recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + + return reinterpret_cast<buf_block_t*>(bpage); + } + + /* If we get here, the page was not in buf_pool: init it there */ + + DBUG_PRINT("ib_buf", ("create page %u:%u", + page_id.space(), page_id.page_no())); + + bpage= &free_block->page; + + ut_ad(bpage->state() == buf_page_t::MEMORY); + bpage->lock.x_lock(); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(bpage, false); + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + bpage->set_state(buf_page_t::REINIT + 1); + buf_pool.page_hash.append(chain, bpage); + } + + if (UNIV_UNLIKELY(zip_size)) + { + bpage->zip.data= buf_buddy_alloc(zip_size); + + /* To maintain the invariant block->in_unzip_LRU_list == + block->page.belongs_to_unzip_LRU() we have to add this + block to unzip_LRU after block->page.zip.data is set. 
*/ + ut_ad(bpage->belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE); + } + + buf_pool.stat.n_pages_created++; + mysql_mutex_unlock(&buf_pool.mutex); + + mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX); + + bpage->set_accessed(); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} && + !srv_is_undo_tablespace(page_id.space()) && + !recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + + static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); + memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + + /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the + following pages: + (1) The first page of the InnoDB system tablespace (page 0:0) + (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages + (3) key_version on encrypted pages (not page 0:0) */ + + memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return reinterpret_cast<buf_block_t*>(bpage); +} + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). 
+@param[in,out] space space object +@param[in] offset offset of the tablespace + or deferred space id if space + object is null +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[in,out] free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* +buf_page_create(fil_space_t *space, uint32_t offset, + ulint zip_size, mtr_t *mtr, buf_block_t *free_block) +{ + space->free_page(offset, false); + return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block); +} + +/** Initialize a page in buffer pool while initializing the +deferred tablespace +@param space_id space identfier +@param zip_size ROW_FORMAT=COMPRESSED page size or 0 +@param mtr mini-transaction +@param free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size, + mtr_t *mtr, buf_block_t *free_block) +{ + return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block); +} + +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param read true=read, false=write */ +ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read) +{ + monitor_id_t counter; + + const byte* frame = bpage.zip.data ? 
bpage.zip.data : bpage.frame; + + switch (fil_page_get_type(frame)) { + ulint level; + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + level = btr_page_get_level(frame); + + /* Check if it is an index page for insert buffer */ + if (fil_page_get_type(frame) == FIL_PAGE_INDEX + && btr_page_get_index_id(frame) + == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_IBUF_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE); + } + } else { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_NON_LEAF_PAGE); + } + } + break; + + case FIL_PAGE_UNDO_LOG: + counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE); + break; + + case FIL_PAGE_INODE: + counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE); + break; + + case FIL_PAGE_IBUF_FREE_LIST: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE); + break; + + case FIL_PAGE_IBUF_BITMAP: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE); + break; + + case FIL_PAGE_TYPE_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_TRX_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_FSP_HDR: + counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE); + break; + + case FIL_PAGE_TYPE_XDES: + counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE); + break; + + case FIL_PAGE_TYPE_BLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB2: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE); + break; + + default: + counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE); + } + + MONITOR_INC_NOCHECK(counter); +} + +/** Check if the 
encrypted page is corrupted for the full crc32 format. +@param[in] space_id page belongs to space id +@param[in] d page +@param[in] is_compressed compressed page +@return true if page is corrupted or false if it isn't */ +static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d, + bool is_compressed) +{ + if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID)) + return true; + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + + return !is_compressed && + memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d, + d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4); +} + +/** Check if page is maybe compressed, encrypted or both when we encounter +corrupted page. Note that we can't be 100% sure if page is corrupted +or decrypt/decompress just failed. +@param[in,out] bpage page +@param[in] node data file +@return whether the operation succeeded +@retval DB_SUCCESS if page has been read and is not corrupted +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. */ +static dberr_t buf_page_check_corrupt(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + + byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame; + dberr_t err = DB_SUCCESS; + uint key_version = buf_page_get_key_version(dst_frame, + node.space->flags); + + /* In buf_decrypt_after_read we have either decrypted the page if + page post encryption checksum matches and used key_id is found + from the encryption plugin. If checksum did not match page was + not decrypted and it could be either encrypted and corrupted + or corrupted or good page. If we decrypted, there page could + still be corrupted if used key does not match. 
*/ + const bool seems_encrypted = !node.space->full_crc32() && key_version + && node.space->crypt_data + && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY || + node.space->full_crc32()); + + /* If traditional checksums match, we assume that page is + not anymore encrypted. */ + if (node.space->full_crc32() + && !buf_is_zeroes(span<const byte>(dst_frame, + node.space->physical_size())) + && (key_version || node.space->is_compressed() + || node.space->purpose == FIL_TYPE_TEMPORARY)) { + if (buf_page_full_crc32_is_corrupted( + bpage->id().space(), dst_frame, + node.space->is_compressed())) { + err = DB_PAGE_CORRUPTED; + } + } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) { + err = DB_PAGE_CORRUPTED; + } + + if (seems_encrypted && err == DB_PAGE_CORRUPTED + && bpage->id().page_no() != 0) { + err = DB_DECRYPTION_FAILED; + + ib::error() + << "The page " << bpage->id() + << " in file '" << node.name + << "' cannot be decrypted; key_version=" + << key_version; + } + + return (err); +} + +/** Complete a read of a page. +@param node data file +@return whether the operation succeeded +@retval DB_PAGE_CORRUPTED if the checksum fails +@retval DB_DECRYPTION_FAILED if the page cannot be decrypted +@retval DB_FAIL if the page contains the wrong ID */ +dberr_t buf_page_t::read_complete(const fil_node_t &node) +{ + const page_id_t expected_id{id()}; + ut_ad(is_read_fixed()); + ut_ad(!buf_dblwr.is_inside(id())); + ut_ad(id().space() == node.space->id); + ut_ad(zip_size() == node.space->zip_size()); + ut_ad(!!zip.ssize == !!zip.data); + + const byte *read_frame= zip.data ? 
zip.data : frame; + ut_ad(read_frame); + + dberr_t err; + if (!buf_page_decrypt_after_read(this, node)) + { + err= DB_DECRYPTION_FAILED; + goto database_corrupted; + } + + if (belongs_to_unzip_LRU()) + { + buf_pool.n_pend_unzip++; + auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false); + buf_pool.n_pend_unzip--; + + if (!ok) + { + ib::info() << "Page " << expected_id << " zip_decompress failure."; + err= DB_PAGE_CORRUPTED; + goto database_corrupted; + } + } + + { + const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID), + mach_read_from_4(read_frame + FIL_PAGE_OFFSET)); + + if (read_id == expected_id); + else if (read_id == page_id_t(0, 0)) + { + /* This is likely an uninitialized (all-zero) page. */ + err= DB_FAIL; + goto release_page; + } + else if (!node.space->full_crc32() && + page_id_t(0, read_id.page_no()) == expected_id) + /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace + before MySQL 4.1.1, which introduced innodb_file_per_table. */; + else if (node.space->full_crc32() && + *reinterpret_cast<const uint32_t*> + (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) && + node.space->crypt_data && + node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED) + { + ib::error() << "Cannot decrypt " << expected_id; + err= DB_DECRYPTION_FAILED; + goto release_page; + } + else + { + ib::error() << "Space id and page no stored in the page, read in are " + << read_id << ", should be " << expected_id; + err= DB_PAGE_CORRUPTED; + goto release_page; + } + } + + err= buf_page_check_corrupt(this, node); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +database_corrupted: + if (belongs_to_unzip_LRU()) + memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size); + + if (err == DB_PAGE_CORRUPTED) + { + ib::error() << "Database page corruption on disk" + " or a failed read of file '" + << node.name << "' page " << expected_id + << ". 
You may have to recover from a backup."; + + buf_page_print(read_frame, zip_size()); + + node.space->set_corrupted(); + + ib::info() << " You can use CHECK TABLE to scan" + " your table for corruption. " + << FORCE_RECOVERY_MSG; + } + + if (!srv_force_recovery) + goto release_page; + } + + if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) + { +release_page: + buf_pool.corrupted_evict(this, buf_page_t::READ_FIX); + return err; + } + + const bool recovery= recv_recovery_is_on(); + + if (recovery && !recv_recover_page(node.space, this)) + return DB_PAGE_CORRUPTED; + + const bool ibuf_may_exist= frame && !recv_no_ibuf_operations && + (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) && + fil_page_get_type(read_frame) == FIL_PAGE_INDEX && + page_is_leaf(read_frame); + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(*this, true); + DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no())); + + if (!recovery) + { + ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist + ? READ_FIX - IBUF_EXIST + : READ_FIX - UNFIXED); + ut_ad(f >= READ_FIX); + ut_ad(f < WRITE_FIX); + } + else if (ibuf_may_exist) + set_ibuf_exist(); + + lock.x_unlock(true); + + return DB_SUCCESS; +} + +#ifdef UNIV_DEBUG +/** Check that all blocks are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +void buf_pool_t::assert_all_freed() +{ + mysql_mutex_lock(&mutex); + const chunk_t *chunk= chunks; + for (auto i= n_chunks; i--; chunk++) + if (const buf_block_t* block= chunk->not_freed()) + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + mysql_mutex_unlock(&mutex); +} +#endif /* UNIV_DEBUG */ + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats() +{ + buf_pool.last_printout_time = time(NULL); + buf_pool.old_stat = buf_pool.stat; +} + +/** Invalidate all pages in the buffer pool. 
+All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate() +{ + mysql_mutex_lock(&buf_pool.mutex); + + /* It is possible that a write batch that has been posted + earlier is still not complete. For buffer pool invalidation to + proceed we must ensure there is NO write activity happening. */ + + ut_d(mysql_mutex_unlock(&buf_pool.mutex)); + ut_d(buf_pool.assert_all_freed()); + ut_d(mysql_mutex_lock(&buf_pool.mutex)); + + while (UT_LIST_GET_LEN(buf_pool.LRU)) { + buf_LRU_scan_and_free_block(); + } + + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + + buf_pool.freed_page_clock = 0; + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + buf_pool.stat.init(); + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + +#ifdef UNIV_DEBUG +/** Validate the buffer pool. */ +void buf_pool_t::validate() +{ + ulint n_lru = 0; + ulint n_flushing = 0; + ulint n_free = 0; + ulint n_zip = 0; + + mysql_mutex_lock(&mutex); + + chunk_t* chunk = chunks; + + /* Check the uncompressed blocks. */ + + for (auto i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (auto j = chunk->size; j--; block++) { + ut_ad(block->page.frame); + switch (const auto f = block->page.state()) { + case buf_page_t::NOT_USED: + n_free++; + break; + + case buf_page_t::MEMORY: + case buf_page_t::REMOVE_HASH: + /* do nothing */ + break; + + default: + if (f >= buf_page_t::READ_FIX + && f < buf_page_t::WRITE_FIX) { + /* A read-fixed block is not + necessarily in the page_hash yet. */ + break; + } + ut_ad(f >= buf_page_t::FREED); + const page_id_t id{block->page.id()}; + ut_ad(page_hash.get( + id, + page_hash.cell_get(id.fold())) + == &block->page); + n_lru++; + } + } + } + + /* Check dirty blocks. 
*/ + + mysql_mutex_lock(&flush_list_mutex); + for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_file()); + ut_ad(b->oldest_modification()); + ut_ad(!fsp_is_system_temporary(b->id().space())); + n_flushing++; + + if (UNIV_UNLIKELY(!b->frame)) { + n_lru++; + n_zip++; + } + const page_id_t id{b->id()}; + ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b); + } + + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); + + mysql_mutex_unlock(&flush_list_mutex); + + if (n_chunks_new == n_chunks + && n_lru + n_free > curr_size + n_zip) { + + ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free + << ", pool " << curr_size + << " zip " << n_zip << ". Aborting..."; + } + + ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); + + if (n_chunks_new == n_chunks + && UT_LIST_GET_LEN(free) != n_free) { + + ib::fatal() << "Free list len " + << UT_LIST_GET_LEN(free) + << ", free blocks " << n_free << ". Aborting..."; + } + + mysql_mutex_unlock(&mutex); + + ut_d(buf_LRU_validate()); + ut_d(buf_flush_validate()); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Write information of the buf_pool to the error log. 
*/ +void buf_pool_t::print() +{ + index_id_t* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + index_id_t id; + ulint n_found; + chunk_t* chunk; + dict_index_t* index; + + size = curr_size; + + index_ids = static_cast<index_id_t*>( + ut_malloc_nokey(size * sizeof *index_ids)); + + counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size)); + + mysql_mutex_lock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + + ib::info() + << "[buffer pool: size=" << curr_size + << ", database pages=" << UT_LIST_GET_LEN(LRU) + << ", free pages=" << UT_LIST_GET_LEN(free) + << ", modified database pages=" + << UT_LIST_GET_LEN(flush_list) + << ", n pending decompressions=" << n_pend_unzip + << ", n pending flush LRU=" << n_flush() + << " list=" << os_aio_pending_writes() + << ", pages made young=" << stat.n_pages_made_young + << ", not young=" << stat.n_pages_not_made_young + << ", pages read=" << stat.n_pages_read + << ", created=" << stat.n_pages_created + << ", written=" << stat.n_pages_written << "]"; + + mysql_mutex_unlock(&flush_list_mutex); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + chunk = chunks; + + for (i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + ulint n_blocks = chunk->size; + + for (; n_blocks--; block++) { + const buf_frame_t* frame = block->page.frame; + + if (fil_page_index_page_check(frame)) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (index_ids[j] == id) { + counts[j]++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + } + + mysql_mutex_unlock(&mutex); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + if (!index) { + ib::info() << "Block count for index " + << index_ids[i] << " in buffer is about " + << counts[i]; + } else { + ib::info() << "Block count for index " << 
index_ids[i] + << " in buffer is about " << counts[i] + << ", index " << index->name + << " of table " << index->table->name; + } + } + + ut_free(index_ids); + ut_free(counts); + + validate(); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number() +{ + ulint fixed_pages_number= 0; + + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b; + b= UT_LIST_GET_NEXT(LRU, b)) + if (b->state() > buf_page_t::UNFIXED) + fixed_pages_number++; + + mysql_mutex_unlock(&buf_pool.mutex); + + return fixed_pages_number; +} +#endif /* UNIV_DEBUG */ + +/** Collect buffer pool metadata. +@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info) +{ + time_t current_time; + double time_elapsed; + + mysql_mutex_lock(&buf_pool.mutex); + + pool_info->pool_size = buf_pool.curr_size; + + pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + pool_info->old_lru_len = buf_pool.LRU_old_len; + + pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); + + pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + + pool_info->n_pend_reads = os_aio_pending_reads_approx(); + + pool_info->n_pending_flush_lru = buf_pool.n_flush(); + + pool_info->n_pending_flush_list = os_aio_pending_writes(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool.last_printout_time); + + pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young; + + pool_info->n_pages_not_made_young = + buf_pool.stat.n_pages_not_made_young; + + pool_info->n_pages_read = buf_pool.stat.n_pages_read; + + pool_info->n_pages_created = buf_pool.stat.n_pages_created; + + pool_info->n_pages_written = 
buf_pool.stat.n_pages_written; + + pool_info->n_page_gets = buf_pool.stat.n_page_gets; + + pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd; + pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read; + + pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted; + + pool_info->page_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young) + / time_elapsed; + + pool_info->page_not_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young) + / time_elapsed; + + pool_info->pages_read_rate = + static_cast<double>(buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read) + / time_elapsed; + + pool_info->pages_created_rate = + static_cast<double>(buf_pool.stat.n_pages_created + - buf_pool.old_stat.n_pages_created) + / time_elapsed; + + pool_info->pages_written_rate = + static_cast<double>(buf_pool.stat.n_pages_written + - buf_pool.old_stat.n_pages_written) + / time_elapsed; + + pool_info->n_page_get_delta = buf_pool.stat.n_page_gets + - buf_pool.old_stat.n_page_gets; + + if (pool_info->n_page_get_delta) { + pool_info->page_read_delta = buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read; + + pool_info->young_making_delta = + buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young; + + pool_info->not_young_making_delta = + buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young; + } + pool_info->pages_readahead_rnd_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd + - buf_pool.old_stat.n_ra_pages_read_rnd) + / time_elapsed; + + + pool_info->pages_readahead_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read + - buf_pool.old_stat.n_ra_pages_read) + / time_elapsed; + + pool_info->pages_evicted_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_evicted + - buf_pool.old_stat.n_ra_pages_evicted) + / time_elapsed; + + pool_info->unzip_lru_len = 
UT_LIST_GET_LEN(buf_pool.unzip_LRU); + + pool_info->io_sum = buf_LRU_stat_sum.io; + + pool_info->io_cur = buf_LRU_stat_cur.io; + + pool_info->unzip_sum = buf_LRU_stat_sum.unzip; + + pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +static +void +buf_print_io_instance( +/*==================*/ + buf_pool_info_t*pool_info, /*!< in: buffer pool info */ + FILE* file) /*!< in/out: buffer where to print */ +{ + ut_ad(pool_info); + + fprintf(file, + "Buffer pool size " ULINTPF "\n" + "Free buffers " ULINTPF "\n" + "Database pages " ULINTPF "\n" + "Old database pages " ULINTPF "\n" + "Modified db pages " ULINTPF "\n" + "Percent of dirty pages(LRU & free pages): %.3f\n" + "Max dirty pages percent: %.3f\n" + "Pending reads " ULINTPF "\n" + "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n", + pool_info->pool_size, + pool_info->free_list_len, + pool_info->lru_len, + pool_info->old_lru_len, + pool_info->flush_list_len, + static_cast<double>(pool_info->flush_list_len) + / (static_cast<double>(pool_info->lru_len + + pool_info->free_list_len) + 1.0) + * 100.0, + srv_max_buf_pool_modified_pct, + pool_info->n_pend_reads, + pool_info->n_pending_flush_lru, + pool_info->n_pending_flush_list); + + fprintf(file, + "Pages made young " ULINTPF ", not young " ULINTPF "\n" + "%.2f youngs/s, %.2f non-youngs/s\n" + "Pages read " ULINTPF ", created " ULINTPF + ", written " ULINTPF "\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + pool_info->n_pages_made_young, + pool_info->n_pages_not_made_young, + pool_info->page_made_young_rate, + pool_info->page_not_made_young_rate, + pool_info->n_pages_read, + pool_info->n_pages_created, + pool_info->n_pages_written, + pool_info->pages_read_rate, + pool_info->pages_created_rate, + pool_info->pages_written_rate); + + if (pool_info->n_page_get_delta) { + double 
hit_rate = static_cast<double>( + pool_info->page_read_delta) + / static_cast<double>(pool_info->n_page_get_delta); + + if (hit_rate > 1) { + hit_rate = 1; + } + + fprintf(file, + "Buffer pool hit rate " ULINTPF " / 1000," + " young-making rate " ULINTPF " / 1000 not " + ULINTPF " / 1000\n", + ulint(1000 * (1 - hit_rate)), + ulint(1000 + * double(pool_info->young_making_delta) + / double(pool_info->n_page_get_delta)), + ulint(1000 * double(pool_info->not_young_making_delta) + / double(pool_info->n_page_get_delta))); + } else { + fputs("No buffer pool page gets since the last printout\n", + file); + } + + /* Statistics about read ahead algorithm */ + fprintf(file, "Pages read ahead %.2f/s," + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", + + pool_info->pages_readahead_rate, + pool_info->pages_evicted_rate, + pool_info->pages_readahead_rnd_rate); + + /* Print some values to help us with visualizing what is + happening with LRU eviction. */ + fprintf(file, + "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n" + "I/O sum[" ULINTPF "]:cur[" ULINTPF "], " + "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n", + pool_info->lru_len, pool_info->unzip_lru_len, + pool_info->io_sum, pool_info->io_cur, + pool_info->unzip_sum, pool_info->unzip_cur); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file) /*!< in/out: buffer where to print */ +{ + buf_pool_info_t pool_info; + + buf_stats_get_pool_info(&pool_info); + buf_print_io_instance(&pool_info, file); +} + +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. 
+@param page page frame +@param fsp_flags contents of FSP_SPACE_FLAGS +@return whether the page is encrypted and valid */ +bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags) +{ + if (!fil_space_t::full_crc32(fsp_flags)) { + return fil_space_verify_crypt_checksum( + page, fil_space_t::zip_size(fsp_flags)); + } + + return !buf_page_is_corrupted(true, page, fsp_flags); +} + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& operator<<(std::ostream &out, const page_id_t page_id) +{ + out << "[page id: space=" << page_id.space() + << ", page number=" << page_id.page_no() << "]"; + return out; +} +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc new file mode 100644 index 00000000..662343ae --- /dev/null +++ b/storage/innobase/buf/buf0checksum.cc @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.cc +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#include "buf0checksum.h" +#include "fil0fil.h" +#include "ut0rnd.h" + +#ifndef UNIV_INNOCHECKSUM +#include "srv0srv.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the CRC32 checksum of a page. The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page) +{ + /* Note: innodb_checksum_algorithm=crc32 could and should have + included the entire page in the checksum, and CRC-32 values + should be combined with the CRC-32 function, not with + exclusive OR. We stick to the current algorithm in order to + remain compatible with old data files. */ + return my_crc32c(0, page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + ^ my_crc32c(0, page + FIL_PAGE_DATA, + srv_page_size + - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); +} + +#ifndef UNIV_INNOCHECKSUM +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. 
+@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_new_checksum(const byte* page) +{ + ulint checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool + to the first pages of data files, we have to skip them in the page + checksum calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. */ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + srv_page_size - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + return(static_cast<uint32_t>(checksum)); +} + +/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that +the checksum only looked at the first few bytes of the page. +This calculates that old checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_old_checksum(const byte* page) +{ + return(static_cast<uint32_t> + (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION))); +} +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc new file mode 100644 index 00000000..e9aea355 --- /dev/null +++ b/storage/innobase/buf/buf0dblwr.cc @@ -0,0 +1,779 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dblwr.cc +Doublwrite buffer module + +Created 2011/12/19 +*******************************************************/ + +#include "buf0dblwr.h" +#include "buf0flu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "fil0crypt.h" +#include "fil0pagecompress.h" + +using st_::span; + +/** The doublewrite buffer */ +buf_dblwr_t buf_dblwr; + +/** @return the TRX_SYS page */ +inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) +{ + return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + 0, RW_X_LATCH, mtr); +} + +void buf_dblwr_t::init() +{ + if (!active_slot) + { + active_slot= &slots[0]; + mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); + pthread_cond_init(&cond, nullptr); + } +} + +/** Initialise the persistent storage of the doublewrite buffer. 
+@param header doublewrite page header in the TRX_SYS page */ +inline void buf_dblwr_t::init(const byte *header) +{ + ut_ad(!active_slot->first_free); + ut_ad(!active_slot->reserved); + ut_ad(!batch_running); + + block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1)); + block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2)); + + const uint32_t buf_size= 2 * block_size(); + for (int i= 0; i < 2; i++) + { + slots[i].write_buf= static_cast<byte*> + (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size)); + slots[i].buf_block_arr= static_cast<element*> + (ut_zalloc_nokey(buf_size * sizeof(element))); + } + active_slot= &slots[0]; +} + +/** Create or restore the doublewrite buffer in the TRX_SYS page. +@return whether the operation succeeded */ +bool buf_dblwr_t::create() +{ + if (is_created()) + return true; + + mtr_t mtr; + const ulint size= block_size(); + +start_again: + mtr.start(); + + dberr_t err; + buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + if (!trx_sys_block) + { + mtr.commit(); + return false; + } + + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->page.frame) == + TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* The doublewrite buffer has already been created: just read in + some numbers */ + init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame); + mtr.commit(); + return true; + } + + if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size) + { + ib::error() << "Cannot create doublewrite buffer: " + "the first file in innodb_data_file_path must be at least " + << (3 * (size >> (20U - srv_page_size_shift))) << "M."; +fail: + mtr.commit(); + return false; + } + else + { + buf_block_t *b= fseg_create(fil_system.sys_space, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, + &mtr, &err, false, trx_sys_block); + if (!b) + { + ib::error() << "Cannot create doublewrite buffer: " << err; + goto fail; + } + + ib::info() << "Doublewrite buffer not found: creating 
new"; + + /* FIXME: After this point, the doublewrite buffer creation + is not atomic. The doublewrite buffer should not exist in + the InnoDB system tablespace file in the first place. + It could be located in separate optional file(s) in a + user-specified location. */ + } + + byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE; + i < 2 * size + extent_size / 2; i++) + { + buf_block_t *new_block= + fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP, + false, &mtr, &mtr, &err); + if (!new_block) + { + ib::error() << "Cannot create doublewrite buffer: " + " you must increase your tablespace size." + " Cannot continue operation."; + /* This may essentially corrupt the doublewrite + buffer. However, usually the doublewrite buffer + is created at database initialization, and it + should not matter (just remove all newly created + InnoDB files and restart). */ + mtr.commit(); + return false; + } + + /* We read the allocated pages to the buffer pool; when they are + written to disk in a flush, the space id and page number fields + are also written to the pages. When we at database startup read + pages from the doublewrite buffer, we know that if the space id + and page number in them are the same as the page position in the + tablespace, then the page has not been written to in + doublewrite. */ + + ut_ad(new_block->page.lock.not_recursive()); + const page_id_t id= new_block->page.id(); + /* We only do this in the debug build, to ensure that the check in + buf_flush_init_for_writing() will see a valid page type. The + flushes of new_block are actually unnecessary here. 
*/ + ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame, + FIL_PAGE_TYPE_SYS)); + + if (i == size / 2) + { + ut_a(id.page_no() == size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i == size / 2 + size) + { + ut_a(id.page_no() == 2 * size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i > size / 2) + ut_a(id.page_no() == prev_page_no + 1); + + if (((i + 1) & 15) == 0) { + /* rw_locks can only be recursively x-locked 2048 times. (on 32 + bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a + negative number, and thus lock_word becomes like a shared lock). + For 4k page size this loop will lock the fseg header too many + times. Since this code is not done while any other threads are + active, restart the MTR occasionally. 
*/ + mtr.commit(); + mtr.start(); + trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + } + + prev_page_no= id.page_no(); + } + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_MAGIC_N); + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); + mtr.commit(); + + buf_flush_wait_flushed(mtr.commit_lsn()); + + /* Remove doublewrite pages from LRU */ + buf_pool_invalidate(); + goto start_again; +} + +/** Initialize the doublewrite buffer memory structure on recovery. +If we are upgrading from a version before MySQL 4.1, then this +function performs the necessary update operations to support +innodb_file_per_table. If we are in a crash recovery, this function +loads the pages from double write buffer into memory. 
+@param file File handle +@param path Path name of file +@return DB_SUCCESS or error code */ +dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path) +{ + ut_ad(this == &buf_dblwr); + const uint32_t size= block_size(); + + /* We do the file i/o past the buffer pool */ + byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + /* Read the TRX_SYS header to check if we are using the doublewrite buffer */ + dberr_t err= os_file_read(IORequestRead, file, read_buf, + TRX_SYS_PAGE_NO << srv_page_size_shift, + srv_page_size, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the system tablespace header page"; +func_exit: + aligned_free(read_buf); + return err; + } + + /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */ + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE + + read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* There is no doublewrite buffer initialized in the TRX_SYS page. + This should normally not be possible; the doublewrite buffer should + be initialized when creating the database. 
*/ + err= DB_SUCCESS; + goto func_exit; + } + + init(TRX_SYS_DOUBLEWRITE + read_buf); + + const bool upgrade_to_innodb_file_per_table= + mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + TRX_SYS_DOUBLEWRITE + read_buf) != + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N; + + auto write_buf= active_slot->write_buf; + /* Read the pages from the doublewrite buffer to memory */ + err= os_file_read(IORequestRead, file, write_buf, + block1.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the first double write buffer extent"; + goto func_exit; + } + + err= os_file_read(IORequestRead, file, + write_buf + (size << srv_page_size_shift), + block2.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the second double write buffer extent"; + goto func_exit; + } + + byte *page= write_buf; + + if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table)) + { + ib::info() << "Resetting space id's in the doublewrite buffer"; + + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + { + memset(page + FIL_PAGE_SPACE_ID, 0, 4); + /* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to + calculate new checksums for the pages because the field + .._SPACE_ID does not affect them. Write the page back to where + we read it from. */ + const ulint source_page_no= i < size + ? block1.page_no() + i + : block2.page_no() + i - size; + err= os_file_write(IORequestWrite, path, file, page, + source_page_no << srv_page_size_shift, srv_page_size); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to upgrade the double write buffer"; + goto func_exit; + } + } + os_file_flush(file); + } + else + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN))) + /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. 
*/ + recv_sys.dblwr.add(page); + + err= DB_SUCCESS; + goto func_exit; +} + +/** Process and remove the double write buffer pages for all tablespaces. */ +void buf_dblwr_t::recover() +{ + ut_ad(log_sys.last_checkpoint_lsn); + if (!is_created()) + return; + + uint32_t page_no_dblwr= 0; + byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size, + srv_page_size)); + byte *const buf= read_buf + srv_page_size; + + for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin(); + i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr) + { + byte *page= *i; + const uint32_t page_no= page_get_page_no(page); + if (!page_no) /* recovered via recv_dblwr_t::restore_first_page() */ + continue; + + const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); + if (log_sys.last_checkpoint_lsn > lsn) + /* Pages written before the checkpoint are not useful for recovery. */ + continue; + const uint32_t space_id= page_get_space_id(page); + const page_id_t page_id(space_id, page_no); + + if (recv_sys.scanned_lsn < lsn) + { + ib::info() << "Ignoring a doublewrite copy of page " << page_id + << " with future log sequence number " << lsn; + continue; + } + + fil_space_t *space= fil_space_t::get(space_id); + + if (!space) + /* The tablespace that this page once belonged to does not exist */ + continue; + + if (UNIV_UNLIKELY(page_no >= space->get_size())) + { + /* Do not report the warning for undo tablespaces, because they + can be truncated in place. */ + if (!srv_is_undo_tablespace(space_id)) + ib::warn() << "A copy of page " << page_no + << " in the doublewrite buffer slot " << page_no_dblwr + << " is beyond the end of " << space->chain.start->name + << " (" << space->size << " pages)"; +next_page: + space->release(); + continue; + } + + const ulint physical_size= space->physical_size(); + ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size))); + + /* We want to ensure that for partial reads the unread portion of + the page is NUL. 
*/ + memset(read_buf, 0x0, physical_size); + + /* Read in the actual page from the file */ + fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER), + os_offset_t{page_no} * physical_size, + physical_size, read_buf); + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) + { + ib::warn() << "Double write buffer recovery: " << page_id + << " ('" << space->chain.start->name + << "') read failed with error: " << fio.err; + continue; + } + + if (buf_is_zeroes(span<const byte>(read_buf, physical_size))) + { + /* We will check if the copy in the doublewrite buffer is + valid. If not, we will ignore this page (there should be redo + log records to initialize it). */ + } + else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf)) + goto next_page; + else + /* We intentionally skip this message for all-zero pages. */ + ib::info() << "Trying to recover page " << page_id + << " from the doublewrite buffer."; + + page= recv_sys.dblwr.find_page(page_id, space, buf); + + if (!page) + goto next_page; + + /* Write the good page from the doublewrite buffer to the intended + position. */ + space->reacquire(); + fio= space->io(IORequestWrite, + os_offset_t{page_id.page_no()} * physical_size, + physical_size, page); + + if (fio.err == DB_SUCCESS) + ib::info() << "Recovered page " << page_id << " to '" << fio.node->name + << "' from the doublewrite buffer."; + goto next_page; + } + + recv_sys.dblwr.pages.clear(); + fil_flush_file_spaces(); + aligned_free(read_buf); +} + +/** Free the doublewrite buffer. */ +void buf_dblwr_t::close() +{ + if (!active_slot) + return; + + ut_ad(!active_slot->reserved); + ut_ad(!active_slot->first_free); + ut_ad(!batch_running); + + pthread_cond_destroy(&cond); + for (int i= 0; i < 2; i++) + { + aligned_free(slots[i].write_buf); + ut_free(slots[i].buf_block_arr); + } + mysql_mutex_destroy(&mutex); + + memset((void*) this, 0, sizeof *this); +} + +/** Update the doublewrite buffer on write completion. 
*/
void buf_dblwr_t::write_completed()
{
  ut_ad(this == &buf_dblwr);
  ut_ad(!srv_read_only_mode);

  mysql_mutex_lock(&mutex);

  ut_ad(is_created());
  ut_ad(srv_use_doublewrite_buf);
  ut_ad(batch_running);
  /* The batch being completed is always in the slot that is NOT active. */
  slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
  ut_ad(flush_slot->reserved);
  ut_ad(flush_slot->reserved <= flush_slot->first_free);

  /* Each completed data-page write decrements the reservation count;
  the last one to finish tears down the batch. */
  if (!--flush_slot->reserved)
  {
    /* Release the mutex around the fsync so that concurrent
    add_to_batch()/write_completed() callers are not blocked on I/O. */
    mysql_mutex_unlock(&mutex);
    /* This will finish the batch. Sync data files to the disk. */
    fil_flush_file_spaces();
    mysql_mutex_lock(&mutex);

    /* We can now reuse the doublewrite memory buffer: */
    flush_slot->first_free= 0;
    batch_running= false;
    /* Wake any flush_buffered_writes() waiting for the batch to end. */
    pthread_cond_broadcast(&cond);
  }

  mysql_mutex_unlock(&mutex);
}

#ifdef UNIV_DEBUG
/** Check the LSN values on the page.
Compares the FIL_PAGE_LSN field in the header against the LSN copy that
is stored near the end of the page, which must agree for a consistent page.
@param[in] page page to check
@param[in] s tablespace */
static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
{
  /* Ignore page_compressed or encrypted pages */
  if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
    return;
  /* Low 4 bytes of the 8-byte header LSN... */
  const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
  /* ...must match the trailer LSN copy, whose offset depends on whether
  the tablespace uses the full_crc32 format. */
  const byte* lsn_end= page + srv_page_size -
    (s.full_crc32()
     ? FIL_PAGE_FCRC32_END_LSN
     : FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
  ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
}

/** Look up the tablespace of a block and check its page LSN fields.
@param b  buffer page descriptor
@param page  page frame to check */
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
{
  if (fil_space_t *space= fil_space_t::get_for_write(b.id().space()))
  {
    buf_dblwr_check_page_lsn(page, *space);
    space->release();
  }
}

/** Check the LSN values on the page with which this block is associated.
*/
static void buf_dblwr_check_block(const buf_page_t *bpage)
{
  ut_ad(bpage->in_file());
  const page_t *page= bpage->frame;
  ut_ad(page);

  /* Only index-like page types are validated; all other page types
  fall through the switch and are accepted silently. */
  switch (fil_page_get_type(page)) {
  case FIL_PAGE_INDEX:
  case FIL_PAGE_TYPE_INSTANT:
  case FIL_PAGE_RTREE:
    if (page_is_comp(page))
    {
      if (page_simple_validate_new(page))
        return;
    }
    else if (page_simple_validate_old(page))
      return;
    /* While it is possible that this is not an index page but just
    happens to have wrongly set FIL_PAGE_TYPE, such pages should never
    be modified to without also adjusting the page type during page
    allocation or buf_flush_init_for_writing() or
    fil_block_reset_type(). */
    buf_page_print(page);

    ib::fatal() << "Apparent corruption of an index page " << bpage->id()
                << " to be written to data file. We intentionally crash"
                " the server to prevent corrupt data from ending up in"
                " data files.";
  }
}
#endif /* UNIV_DEBUG */

/** Start flushing the currently accumulated doublewrite batch, if any.
Waits (on cond) for a previous batch to finish, switches the active slot,
and submits the doublewrite block write(s) asynchronously.
@param size  doublewrite block size, in pages (== block_size())
@return whether a batch was submitted (and the mutex released) */
bool buf_dblwr_t::flush_buffered_writes(const ulint size)
{
  mysql_mutex_assert_owner(&mutex);
  ut_ad(size == block_size());

  for (;;)
  {
    if (!active_slot->first_free)
      return false;
    if (!batch_running)
      break;
    /* Another batch is in flight; wait for write_completed() to
    broadcast before reusing the flush slot. */
    my_cond_wait(&cond, &mutex.m_mutex);
  }

  ut_ad(active_slot->reserved == active_slot->first_free);
  ut_ad(!flushing_buffered_writes);

  /* Disallow anyone else to start another batch of flushing. */
  slot *flush_slot= active_slot;
  /* Switch the active slot */
  active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
  ut_a(active_slot->first_free == 0);
  batch_running= true;
  const ulint old_first_free= flush_slot->first_free;
  auto write_buf= flush_slot->write_buf;
  /* If the two doublewrite blocks are not contiguous on disk and the
  payload spills past the first block, two writes are needed. */
  const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
    old_first_free > size;
  flushing_buffered_writes= 1 + multi_batch;
  /* Now safe to release the mutex.
*/
  mysql_mutex_unlock(&mutex);
#ifdef UNIV_DEBUG
  for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
  {
    buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;

    if (bpage->zip.data)
      /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
      continue;

    /* Check that the actual page in the buffer pool is not corrupt
    and the LSN values are sane. */
    buf_dblwr_check_block(bpage);
    ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
  }
#endif /* UNIV_DEBUG */
  const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start,
                          IORequest::DBLWR_BATCH};
  ut_a(fil_system.sys_space->acquire());
  if (multi_batch)
  {
    /* Two asynchronous writes: one reference per write. */
    fil_system.sys_space->reacquire();
    os_aio(request, write_buf,
           os_offset_t{block1.page_no()} << srv_page_size_shift,
           size << srv_page_size_shift);
    os_aio(request, write_buf + (size << srv_page_size_shift),
           os_offset_t{block2.page_no()} << srv_page_size_shift,
           (old_first_free - size) << srv_page_size_shift);
  }
  else
    os_aio(request, write_buf,
           os_offset_t{block1.page_no()} << srv_page_size_shift,
           old_first_free << srv_page_size_shift);
  return true;
}

/** Determine the source frame for a buffered write.
@param request  write request
@return the punched-hole output buffer for page_compressed writes,
the compressed page image, or the uncompressed frame */
static void *get_frame(const IORequest &request)
{
  if (request.slot)
    return request.slot->out_buf;
  const buf_page_t *bpage= request.bpage;
  return bpage->zip.data ? bpage->zip.data : bpage->frame;
}

/** Note that the doublewrite block write has completed.
When both block writes (or the single write) of a batch have finished,
flush the system tablespace and then submit the writes of the individual
data pages that the batch protects.
@param request  the completed doublewrite-batch write request */
void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
{
  ut_ad(this == &buf_dblwr);
  ut_ad(srv_use_doublewrite_buf);
  ut_ad(is_created());
  ut_ad(!srv_read_only_mode);
  ut_ad(!request.bpage);
  ut_ad(request.node == fil_system.sys_space->chain.start);
  ut_ad(request.type == IORequest::DBLWR_BATCH);
  mysql_mutex_lock(&mutex);
  ut_ad(batch_running);
  ut_ad(flushing_buffered_writes);
  ut_ad(flushing_buffered_writes <= 2);
  writes_completed++;
  if (UNIV_UNLIKELY(--flushing_buffered_writes))
  {
    /* The other half of a multi_batch write is still outstanding. */
    mysql_mutex_unlock(&mutex);
    return;
  }

  slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
  ut_ad(flush_slot->reserved == flush_slot->first_free);
  /* increment the doublewrite flushed pages counter */
  pages_written+= flush_slot->first_free;
  mysql_mutex_unlock(&mutex);

  /* Now flush the doublewrite buffer data to disk */
  fil_system.sys_space->flush<false>();

  /* The writes have been flushed to disk now and in recovery we will
  find them in the doublewrite buffer blocks. Next, write the data pages. */
  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
  {
    auto e= flush_slot->buf_block_arr[i];
    buf_page_t* bpage= e.request.bpage;
    ut_ad(bpage->in_file());

    void *frame= get_frame(e.request);
    ut_ad(frame);

    auto e_size= e.size;

    if (UNIV_LIKELY_NULL(bpage->zip.data))
    {
      /* ROW_FORMAT=COMPRESSED: write only the compressed size. */
      e_size= bpage->zip_size();
      ut_ad(e_size);
    }
    else
    {
      ut_ad(!bpage->zip_size());
      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
    }

    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
                                      (FIL_PAGE_LSN +
                                       static_cast<const byte*>(frame)));
    ut_ad(lsn);
    ut_ad(lsn >= bpage->oldest_modification());
    /* Write-ahead logging: the redo log must be durable up to the
    page LSN before the page itself may be written. */
    log_write_up_to(lsn, true);
    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
                              frame, bpage);
  }
}

/** Flush possible buffered writes to persistent storage.
It is very important to call this function after a batch of writes has been
posted, and also when we may have to wait for a page latch!
Otherwise a deadlock of threads can occur. */
void buf_dblwr_t::flush_buffered_writes()
{
  if (!is_created() || !srv_use_doublewrite_buf)
  {
    /* Doublewrite is disabled: just make data files durable. */
    fil_flush_file_spaces();
    return;
  }

  ut_ad(!srv_read_only_mode);
  const ulint size= block_size();

  mysql_mutex_lock(&mutex);
  /* flush_buffered_writes(size) releases the mutex itself when it
  submits a batch; otherwise we must release it here. */
  if (!flush_buffered_writes(size))
    mysql_mutex_unlock(&mutex);
}

/** Schedule a page write. If the doublewrite memory buffer is full,
flush_buffered_writes() will be invoked to make space.
@param request asynchronous write request
@param size payload size in bytes */
void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
{
  ut_ad(request.is_async());
  ut_ad(request.is_write());
  ut_ad(request.bpage);
  ut_ad(request.bpage->in_file());
  ut_ad(request.node);
  ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
  ut_ad(request.node->space->id == request.bpage->id().space());
  ut_ad(request.node->space->referenced());
  ut_ad(!srv_read_only_mode);

  /* Capacity of one slot: both doublewrite blocks. */
  const ulint buf_size= 2 * block_size();

  mysql_mutex_lock(&mutex);

  for (;;)
  {
    ut_ad(active_slot->first_free <= buf_size);
    if (active_slot->first_free != buf_size)
      break;

    /* The active slot is full; submit it as a batch and retry.
    flush_buffered_writes() releases the mutex on success. */
    if (flush_buffered_writes(buf_size / 2))
      mysql_mutex_lock(&mutex);
  }

  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;

  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
  and at least srv_page_size (4096-byte) for everything else. */
  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
  memset_aligned<256>(p + size, 0, srv_page_size - size);
  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
  are integer multiples of 256, so the above can translate into simple
  SIMD instructions. Currently, we make no such assumptions about the
  non-pointer parameters that are passed to the _aligned templates. */
  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
  ut_ad(active_slot->reserved == active_slot->first_free);
  ut_ad(active_slot->reserved < buf_size);
  /* Record the request so that flush_buffered_writes_completed() can
  later submit the data-page write. */
  new (active_slot->buf_block_arr + active_slot->first_free++)
    element{request, size};
  active_slot->reserved= active_slot->first_free;

  /* If this write filled the slot, submit the batch right away. */
  if (active_slot->first_free != buf_size ||
      !flush_buffered_writes(buf_size / 2))
    mysql_mutex_unlock(&mutex);
}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..957632db
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,765 @@
/*****************************************************************************

Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2022, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0dump.cc
Implements a buffer pool dump/load.

Created April 08, 2011 Vasil Dimov
*******************************************************/

#include "my_global.h"
#include "mysqld.h"
#include "my_sys.h"

#include "mysql/psi/mysql_stage.h"
#include "mysql/psi/psi.h"

#include "buf0rea.h"
#include "buf0dump.h"
#include "dict0dict.h"
#include "os0file.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "ut0byte.h"

#include <algorithm>

#include "mysql/service_wsrep.h" /* wsrep_recovery */
#include <my_service_manager.h>

static void buf_do_load_dump();

/** Severity of a dump/load status message; selects ib::info vs ib::error. */
enum status_severity {
	STATUS_INFO,
	STATUS_ERR
};

#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)

/* Flags that tell the buffer pool dump/load thread which action should it
take after being waked up.
NOTE(review): volatile provides neither atomicity nor ordering; these are
single-writer wake-up hints consumed by the task thread — confirm that no
stronger synchronization is required here. */
static volatile bool buf_dump_should_start;
static volatile bool buf_load_should_start;

/* Set by buf_load_abort() to make an in-progress buf_load() stop early. */
static bool buf_load_abort_flag;

/** Start the buffer pool dump/load task and instructs it to start a dump. */
void buf_dump_start()
{
  buf_dump_should_start= true;
  buf_do_load_dump();
}

/** Start the buffer pool dump/load task and instructs it to start a load. */
void buf_load_start()
{
  buf_load_should_start= true;
  buf_do_load_dump();
}

/*****************************************************************//**
Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3). The value of this variable can be
retrieved by:
SELECT variable_value FROM information_schema.global_status WHERE
variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
or by:
SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
void
buf_dump_status(
/*============*/
	enum status_severity severity,/*!< in: status severity */
	const char*	fmt,	/*!< in: format */
	...)			/*!< in: extra parameters according
				to fmt */
{
	va_list	ap;

	va_start(ap, fmt);

	vsnprintf(
		export_vars.innodb_buffer_pool_dump_status,
		sizeof(export_vars.innodb_buffer_pool_dump_status),
		fmt, ap);

	/* Mirror the status text into the server error log. */
	switch (severity) {
	case STATUS_INFO:
		ib::info() << export_vars.innodb_buffer_pool_dump_status;
		break;

	case STATUS_ERR:
		ib::error() << export_vars.innodb_buffer_pool_dump_status;
		break;
	}

	va_end(ap);
}

/*****************************************************************//**
Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3). The value of this variable can be
retrieved by:
SELECT variable_value FROM information_schema.global_status WHERE
variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
or by:
SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
void
buf_load_status(
/*============*/
	enum status_severity severity,/*!< in: status severity */
	const char*	fmt,	/*!< in: format */
	...)			/*!< in: extra parameters according to fmt */
{
	va_list	ap;

	va_start(ap, fmt);

	vsnprintf(
		export_vars.innodb_buffer_pool_load_status,
		sizeof(export_vars.innodb_buffer_pool_load_status),
		fmt, ap);

	switch (severity) {
	case STATUS_INFO:
		ib::info() << export_vars.innodb_buffer_pool_load_status;
		break;

	case STATUS_ERR:
		ib::error() << export_vars.innodb_buffer_pool_load_status;
		break;
	}

	va_end(ap);
}

/** Returns the directory path where the buffer pool dump file will be created.
@return directory path */
static
const char*
get_buf_dump_dir()
{
	const char*	dump_dir;

	/* The dump file should be created in the default data directory if
	innodb_data_home_dir is set as an empty string.
	*/
	if (!*srv_data_home) {
		dump_dir = fil_path_to_mysql_datadir;
	} else {
		dump_dir = srv_data_home;
	}

	return(dump_dir);
}

/** Generate the path to the buffer pool dump/load file.
@param[out]	path		generated path
@param[in]	path_size	size of 'path', used as in snprintf(3). */
static void buf_dump_generate_path(char *path, size_t path_size)
{
	char buf[FN_REFLEN];

	/* srv_buf_dump_filename is a system variable; guard the read. */
	mysql_mutex_lock(&LOCK_global_system_variables);
	snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(),
		 srv_buf_dump_filename);
	mysql_mutex_unlock(&LOCK_global_system_variables);

	os_file_type_t	type;
	bool		exists = false;
	bool		ret;

	ret = os_file_status(buf, &exists, &type);

	/* For realpath() to succeed the file must exist. */

	if (ret && exists) {
		/* my_realpath() assumes the destination buffer is big enough
		to hold FN_REFLEN bytes. */
		ut_a(path_size >= FN_REFLEN);

		my_realpath(path, buf, 0);
	} else {
		/* If it does not exist, then resolve only srv_data_home
		and append srv_buf_dump_filename to it. */
		char	srv_data_home_full[FN_REFLEN];

		my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
		const char *format;

		/* Avoid a doubled separator if the home path already ends
		with one. */
		switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) {
#ifdef _WIN32
		case '\\':
#endif
		case '/':
			format = "%s%s";
			break;
		default:
			format = "%s/%s";
		}

		snprintf(path, path_size, format,
			 srv_data_home_full, srv_buf_dump_filename);
	}
}


/*****************************************************************//**
Perform a buffer pool dump into the file specified by
innodb_buffer_pool_filename. If any errors occur then the value of
innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_dump( +/*=====*/ + ibool obey_shutdown) /*!< in: quit if we are in a shutting down + state */ +{ +#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown) + + char full_filename[OS_FILE_MAX_PATH]; + char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"]; + char now[32]; + FILE* f; + int ret; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + snprintf(tmp_filename, sizeof(tmp_filename), + "%s.incomplete", full_filename); + + buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s", + full_filename); + +#ifdef _WIN32 + /* use my_fopen() for correct permissions during bootstrap*/ + f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0); +#elif defined(__GLIBC__) || O_CLOEXEC == 0 + f = fopen(tmp_filename, "w" STR_O_CLOEXEC); +#else + { + int fd; + fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640); + if (fd >= 0) { + f = fdopen(fd, "w"); + } + else { + f = NULL; + } + } +#endif + if (f == NULL) { + buf_dump_status(STATUS_ERR, + "Cannot open '%s' for writing: %s", + tmp_filename, strerror(errno)); + return; + } + const buf_page_t* bpage; + page_id_t* dump; + ulint n_pages; + ulint j; + + mysql_mutex_lock(&buf_pool.mutex); + + n_pages = UT_LIST_GET_LEN(buf_pool.LRU); + + /* skip empty buffer pools */ + if (n_pages == 0) { + mysql_mutex_unlock(&buf_pool.mutex); + goto done; + } + + if (srv_buf_pool_dump_pct != 100) { + ulint t_pages; + + /* limit the number of total pages dumped to X% of the + total number of pages */ + t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + if (n_pages > t_pages) { + buf_dump_status(STATUS_INFO, + "Restricted to " ULINTPF + " pages due to " + "innodb_buf_pool_dump_pct=%lu", + t_pages, srv_buf_pool_dump_pct); + n_pages = t_pages; + } + + if (n_pages == 0) { + n_pages = 1; + } + } + + dump = static_cast<page_id_t*>(ut_malloc_nokey( + n_pages * 
sizeof(*dump))); + + if (dump == NULL) { + std::ostringstream str_bytes; + mysql_mutex_unlock(&buf_pool.mutex); + fclose(f); + str_bytes << ib::bytes_iec{n_pages * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0; + bpage != NULL && j < n_pages; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const auto status = bpage->state(); + if (status < buf_page_t::UNFIXED) { + ut_a(status >= buf_page_t::FREED); + continue; + } + const page_id_t id{bpage->id()}; + + if (id.space() == SRV_TMP_SPACE_ID) { + /* Ignore the innodb_temporary tablespace. */ + continue; + } + + dump[j++] = id; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + ut_a(j <= n_pages); + n_pages = j; + + for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) { + ret = fprintf(f, "%u,%u\n", + dump[j].space(), dump[j].page_no()); + if (ret < 0) { + ut_free(dump); + fclose(f); + buf_dump_status(STATUS_ERR, + "Cannot write to '%s': %s", + tmp_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + if (SHUTTING_DOWN() && !(j & 1023)) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "Dumping buffer pool page " + ULINTPF "/" ULINTPF, j + 1, n_pages); + } + } + + ut_free(dump); + +done: + ret = IF_WIN(my_fclose(f,0),fclose(f)); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot close '%s': %s", + tmp_filename, strerror(errno)); + return; + } + /* else */ + + ret = unlink(full_filename); + if (ret != 0 && errno != ENOENT) { + buf_dump_status(STATUS_ERR, + "Cannot delete '%s': %s", + full_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + ret = rename(tmp_filename, full_filename); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot rename '%s' to '%s': %s", + tmp_filename, full_filename, + strerror(errno)); + /* leave tmp_filename to exist */ + return; 
+ } + /* else */ + + /* success */ + + ut_sprintf_timestamp(now); + + buf_dump_status(STATUS_INFO, + "Buffer pool(s) dump completed at %s", now); + + /* Though dumping doesn't related to an incomplete load, + we reset this to 0 here to indicate that a shutdown can also perform + a dump */ + export_vars.innodb_buffer_pool_load_incomplete = 0; +} + +/*****************************************************************//** +Perform a buffer pool load from the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_load_status will be set accordingly, see buf_load_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_load() +/*======*/ +{ + char full_filename[OS_FILE_MAX_PATH]; + char now[32]; + FILE* f; + page_id_t* dump; + ulint dump_n; + ulint i; + uint32_t space_id; + uint32_t page_no; + int fscanf_ret; + + /* Ignore any leftovers from before */ + buf_load_abort_flag = false; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + buf_load_status(STATUS_INFO, + "Loading buffer pool(s) from %s", full_filename); + + f = fopen(full_filename, "r" STR_O_CLOEXEC); + if (f == NULL) { + buf_load_status(STATUS_INFO, + "Cannot open '%s' for reading: %s", + full_filename, strerror(errno)); + return; + } + /* else */ + + /* First scan the file to estimate how many entries are in it. + This file is tiny (approx 500KB per 1GB buffer pool), reading it + two times is fine. 
*/ + dump_n = 0; + while (fscanf(f, "%u,%u", &space_id, &page_no) == 2 + && !SHUTTING_DOWN()) { + dump_n++; + } + + if (!SHUTTING_DOWN() && !feof(f)) { + /* fscanf() returned != 2 */ + const char* what; + if (ferror(f)) { + what = "reading"; + } else { + what = "parsing"; + } + fclose(f); + buf_load_status(STATUS_ERR, "Error %s '%s'," + " unable to load buffer pool (stage 1)", + what, full_filename); + return; + } + + /* If dump is larger than the buffer pool(s), then we ignore the + extra trailing. This could happen if a dump is made, then buffer + pool is shrunk and then load is attempted. */ + dump_n = std::min(dump_n, buf_pool.get_n_pages()); + + if (dump_n != 0) { + dump = static_cast<page_id_t*>(ut_malloc_nokey( + dump_n * sizeof(*dump))); + } else { + fclose(f); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty)", now, full_filename); + return; + } + + if (dump == NULL) { + std::ostringstream str_bytes; + fclose(f); + str_bytes << ib::bytes_iec{dump_n * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + rewind(f); + + export_vars.innodb_buffer_pool_load_incomplete = 1; + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no); + + if (fscanf_ret != 2) { + if (feof(f)) { + break; + } + /* else */ + + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s', unable" + " to load buffer pool (stage 2)", + full_filename); + return; + } + + if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) { + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s': bogus" + " space,page %u,%u at line " ULINTPF + ", unable to load buffer pool", + full_filename, + space_id, page_no, + i); + return; + } + + dump[i] = page_id_t(space_id, page_no); + } + + /* Set dump_n to the actual number of 
initialized elements, + i could be smaller than dump_n here if the file got truncated after + we read it the first time. */ + dump_n = i; + + fclose(f); + + if (dump_n == 0) { + ut_free(dump); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty or had errors)", now, full_filename); + return; + } + + if (!SHUTTING_DOWN()) { + std::sort(dump, dump + dump_n); + } + + /* Avoid calling the expensive fil_space_t::get() for each + page within the same tablespace. dump[] is sorted by (space, page), + so all pages from a given tablespace are consecutive. */ + uint32_t cur_space_id = dump[0].space(); + fil_space_t* space = fil_space_t::get(cur_space_id); + ulint zip_size = space ? space->zip_size() : 0; + + PSI_stage_progress* pfs_stage_progress __attribute__((unused)) + = mysql_set_stage(srv_stage_buffer_pool_load.m_key); + mysql_stage_set_work_estimated(pfs_stage_progress, dump_n); + mysql_stage_set_work_completed(pfs_stage_progress, 0); + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + + /* space_id for this iteration of the loop */ + const uint32_t this_space_id = dump[i].space(); + + if (this_space_id >= SRV_SPACE_ID_UPPER_BOUND) { + continue; + } + + if (this_space_id != cur_space_id) { + if (space) { + space->release(); + } + + cur_space_id = this_space_id; + space = fil_space_t::get(cur_space_id); + + if (!space) { + continue; + } + + zip_size = space->zip_size(); + } + + /* JAN: TODO: As we use background page read below, + if tablespace is encrypted we cant use it. 
*/ + if (!space || dump[i].page_no() >= space->get_size() || + (space->crypt_data && + space->crypt_data->encryption != FIL_ENCRYPTION_OFF && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { + continue; + } + + if (space->is_stopping()) { + space->release(); + space = nullptr; + continue; + } + + space->reacquire(); + buf_read_page_background(space, dump[i], zip_size); + + if (buf_load_abort_flag) { + if (space) { + space->release(); + } + buf_load_abort_flag = false; + ut_free(dump); + buf_load_status( + STATUS_INFO, + "Buffer pool(s) load aborted on request"); + /* Premature end, set estimated = completed = i and + end the current stage event. */ + + mysql_stage_set_work_estimated(pfs_stage_progress, i); + mysql_stage_set_work_completed(pfs_stage_progress, i); + + mysql_end_stage(); + return; + } + +#ifdef UNIV_DEBUG + if ((i+1) >= srv_buf_pool_load_pages_abort) { + buf_load_abort_flag = true; + } +#endif + } + + if (space) { + space->release(); + } + + ut_free(dump); + + if (i == dump_n) { + os_aio_wait_until_no_pending_reads(true); + } + + ut_sprintf_timestamp(now); + + if (i == dump_n) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s", now); + export_vars.innodb_buffer_pool_load_incomplete = 0; + } else if (!buf_load_abort_flag) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to user instigated abort at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we don't want a shutdown to save the buffer pool */ + } else { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to shutdown at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we want to abort without saving the buffer pool */ + } + + /* Make sure that estimated = completed when we end. */ + mysql_stage_set_work_completed(pfs_stage_progress, dump_n); + /* End the stage progress event. */ + mysql_end_stage(); +} + +/** Abort a currently running buffer pool load. 
*/ +void buf_load_abort() +{ + buf_load_abort_flag= true; +} + +/*****************************************************************//** +This is the main task for buffer pool dump/load. when scheduled +either performs a dump or load, depending on server state, state of the variables etc- */ +static void buf_dump_load_func(void *) +{ + ut_ad(!srv_read_only_mode); + static bool first_time = true; + if (first_time && srv_buffer_pool_load_at_startup) { + +#ifdef WITH_WSREP + if (!get_wsrep_recovery()) { +#endif /* WITH_WSREP */ + srv_thread_pool->set_concurrency(srv_n_read_io_threads); + buf_load(); + srv_thread_pool->set_concurrency(); +#ifdef WITH_WSREP + } +#endif /* WITH_WSREP */ + } + first_time = false; + + while (!SHUTTING_DOWN()) { + if (buf_dump_should_start) { + buf_dump_should_start = false; + buf_dump(true); + } + if (buf_load_should_start) { + buf_load_should_start = false; + buf_load(); + } + + if (!buf_dump_should_start && !buf_load_should_start) { + return; + } + } + + /* In shutdown */ + if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) { + if (export_vars.innodb_buffer_pool_load_incomplete) { + buf_dump_status(STATUS_INFO, + "Dumping of buffer pool not started" + " as load was incomplete"); +#ifdef WITH_WSREP + } else if (get_wsrep_recovery()) { +#endif /* WITH_WSREP */ + } else { + buf_dump(false/* do complete dump at shutdown */); + } + } +} + + +/* Execute task with max.concurrency */ +static tpool::task_group tpool_group(1); +static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group); +static bool load_dump_enabled; + +/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/ +void buf_load_at_startup() +{ + load_dump_enabled= true; + if (srv_buffer_pool_load_at_startup) + buf_do_load_dump(); +} + +static void buf_do_load_dump() +{ + if (load_dump_enabled && !buf_dump_load_task.is_running()) + srv_thread_pool->submit_task(&buf_dump_load_task); +} + +/** Wait for currently running load/dumps 
to finish*/
void buf_load_dump_end()
{
  ut_ad(SHUTTING_DOWN());
  buf_dump_load_task.wait();
}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..b6357989
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2765 @@
/*****************************************************************************

Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2022, MariaDB Corporation.
Copyright (c) 2013, 2014, Fusion-io

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.cc
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "univ.i"
#include <my_service_manager.h>
#include <mysql/service_thd_wait.h>
#include <sql_class.h>

#include "buf0flu.h"
#include "buf0buf.h"
#include "buf0checksum.h"
#include "buf0dblwr.h"
#include "srv0start.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "log0crypt.h"
#include "srv0mon.h"
#include "fil0pagecompress.h"
#include "lzo/lzo1x.h"
#include "snappy-c.h"

/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
Also included in buf_pool.stat.n_pages_written. */
ulint buf_lru_flush_page_count;

/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
ulint buf_lru_freed_page_count;

/** Flag indicating if the page_cleaner is in active state. */
Atomic_relaxed<bool> buf_page_cleaner_is_active;

/** Factor for scan length to determine n_pages for intended oldest LSN
progress */
static constexpr ulint buf_flush_lsn_scan_factor = 3;

/** Average redo generation rate */
static lsn_t lsn_avg_rate = 0;

/** Target oldest_modification for the page cleaner background flushing;
writes are protected by buf_pool.flush_list_mutex */
static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
/** Target oldest_modification for the page cleaner furious flushing;
writes are protected by buf_pool.flush_list_mutex */
static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;

#ifdef UNIV_PFS_THREAD
mysql_pfs_key_t page_cleaner_thread_key;
#endif /* UNIV_PFS_THREAD */

/** Page cleaner structure */
static struct
{
  /** total elapsed time in adaptive flushing, in seconds */
  ulint flush_time;
  /** number of adaptive flushing passes */
  ulint flush_pass;
} page_cleaner;

/* @} */

#ifdef UNIV_DEBUG
/** Validate the flush list. */
static void buf_flush_validate_low();

/** Validates the flush list some of the time. */
static void buf_flush_validate_skip()
{
/** Try buf_flush_validate_low() every this many times */
# define BUF_FLUSH_VALIDATE_SKIP 23

	/** The buf_flush_validate_low() call skip counter.
	Use a signed type because of the race condition below. */
	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;

	/* There is a race condition below, but it does not matter,
	because this call is only for heuristic purposes. We want to
	reduce the call frequency of the costly buf_flush_validate_low()
	check in debug builds. */
	if (--buf_flush_validate_count > 0) {
		return;
	}

	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
	buf_flush_validate_low();
}
#endif /* UNIV_DEBUG */

/** Wake up the page cleaner if needed.
@param for_LRU  whether the wakeup is because pages are needed for the LRU
(in that case, also interrupt any timed wait of the cleaner) */
void buf_pool_t::page_cleaner_wakeup(bool for_LRU)
{
  ut_d(buf_flush_validate_skip());
  if (!page_cleaner_idle())
  {
    if (for_LRU)
      /* Ensure that the page cleaner is not in a timed wait. */
      pthread_cond_signal(&do_flush_list);
    return;
  }
  /* Fraction of pages that are dirty, relative to all LRU + free pages. */
  double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
    double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
  double pct_lwm= srv_max_dirty_pages_pct_lwm;

  /* if pct_lwm != 0.0, adaptive flushing is enabled.
  signal buf page cleaner thread
  - if pct_lwm <= dirty_pct then it will invoke apdative flushing flow
  - if pct_lwm > dirty_pct then it will invoke idle flushing flow.

  idle_flushing:
  dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an
  idle flushing use-case.

  Why is last_activity_count not updated always?
  - let's first understand when is server activity count updated.
  - it is updated on commit of a transaction trx_t::commit() and not
  on adding a page to the flush list.
  - page_cleaner_wakeup is called when a page is added to the flush list.

  - now let's say the first user thread, updates the count from X -> Y but
  is yet to commit the transaction (so activity count is still Y).
  followup user threads will see the updated count as (Y) that is matching
  the universal server activity count (Y), giving a false impression that
  the server is idle.

  How to avoid this?
  - by allowing last_activity_count to updated when page-cleaner is made
  active and has work to do. This ensures that the last_activity signal
  is consumed by the page-cleaner before the next one is generated. */
  if (for_LRU ||
      (pct_lwm != 0.0 && (pct_lwm <= dirty_pct ||
                          last_activity_count == srv_get_activity_count())) ||
      srv_max_buf_pool_modified_pct <= dirty_pct)
  {
    page_cleaner_status-= PAGE_CLEANER_IDLE;
    pthread_cond_signal(&do_flush_list);
  }
}

/** Remove a block from flush_list.
@param bpage   buffer pool page */
void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) noexcept
{
  /* Temporary tablespace pages are never on the flush list. */
  ut_ad(!fsp_is_system_temporary(bpage->id().space()));
  mysql_mutex_assert_owner(&flush_list_mutex);
  /* Keep any concurrent flush-list scan's "hazard pointer" valid. */
  flush_hp.adjust(bpage);
  UT_LIST_REMOVE(flush_list, bpage);
  flush_list_bytes-= bpage->physical_size();
  bpage->clear_oldest_modification();
#ifdef UNIV_DEBUG
  buf_flush_validate_skip();
#endif /* UNIV_DEBUG */
}

/** Remove all dirty pages belonging to a given tablespace when we are
deleting the data file of that tablespace.
The pages still remain a part of LRU and are evicted from
the list as they age towards the tail of the LRU.
@param id    tablespace identifier */
void buf_flush_remove_pages(uint32_t id)
{
  const page_id_t first(id, 0), end(id + 1, 0);
  ut_ad(id);

  for (;;)
  {
    mysql_mutex_lock(&buf_pool.mutex);
    bool deferred= false;

    mysql_mutex_lock(&buf_pool.flush_list_mutex);

    /* Scan from the tail (oldest modifications) backwards. */
    for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
    {
      const auto s= bpage->state();
      ut_ad(s >= buf_page_t::REMOVE_HASH);
      ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);

      const page_id_t bpage_id(bpage->id());

      if (bpage_id < first || bpage_id >= end);
      else if (s >= buf_page_t::WRITE_FIX)
        /* A write is in progress; we must wait for it and rescan. */
        deferred= true;
      else
        buf_pool.delete_from_flush_list(bpage);

      bpage= prev;
    }

    mysql_mutex_unlock(&buf_pool.mutex);
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    if (!deferred)
      break;

    os_aio_wait_until_no_pending_writes(true);
  }
}

/*******************************************************************//**
Relocates a buffer control block on
the flush_list. +Note that it is assumed that the contents of bpage have already been +copied to dpage. +IMPORTANT: When this function is called bpage and dpage are not +exact copies of each other. For example, they both will have different +::state. Also the ::list pointers in dpage may be stale. We need to +use the current list node (bpage) to do the list manipulation because +the list pointers could have changed between the time that we copied +the contents of bpage to the dpage and the flush list manipulation +below. */ +ATTRIBUTE_COLD +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + + const lsn_t lsn = bpage->oldest_modification(); + + if (!lsn) { + return; + } + + ut_ad(lsn == 1 || lsn > 2); + ut_ad(dpage->oldest_modification() == lsn); + + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. 
*/ + buf_pool.flush_hp.adjust(bpage); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(buf_pool.flush_list, bpage); + + bpage->clear_oldest_modification(); + + if (lsn == 1) { + buf_pool.flush_list_bytes -= dpage->physical_size(); + dpage->list.prev = nullptr; + dpage->list.next = nullptr; + dpage->clear_oldest_modification(); + } else if (prev) { + ut_ad(prev->oldest_modification()); + UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage); + } else { + UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); + } + + ut_d(buf_flush_validate_low()); +} + +/** Note that a block is no longer dirty, while not removing +it from buf_pool.flush_list +@param temporary whether the page belongs to the temporary tablespace +@param error whether an error may have occurred while writing */ +inline void buf_page_t::write_complete(bool temporary, bool error) +{ + ut_ad(temporary == fsp_is_system_temporary(id().space())); + if (UNIV_UNLIKELY(error)); + else if (temporary) + { + ut_ad(oldest_modification() == 2); + oldest_modification_= 0; + } + else + { + /* We use release memory order to guarantee that callers of + oldest_modification_acquire() will observe the block as + being detached from buf_pool.flush_list, after reading the value 0. */ + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } + const auto s= state(); + ut_ad(s >= WRITE_FIX); + zip.fix.fetch_sub((s >= WRITE_FIX_REINIT) + ? 
(WRITE_FIX_REINIT - UNFIXED) + : (WRITE_FIX - UNFIXED)); + lock.u_unlock(true); +} + +inline void buf_pool_t::n_flush_inc() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_status+= LRU_FLUSH; +} + +inline void buf_pool_t::n_flush_dec() +{ + mysql_mutex_lock(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) + pthread_cond_broadcast(&done_flush_LRU); + mysql_mutex_unlock(&flush_list_mutex); +} + +inline void buf_pool_t::n_flush_dec_holding_mutex() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + page_cleaner_status-= LRU_FLUSH; +} + +/** Complete write of a file page from buf_pool. +@param request write request +@param error whether the write may have failed */ +void buf_page_write_complete(const IORequest &request, bool error) +{ + ut_ad(request.is_write()); + ut_ad(!srv_read_only_mode); + buf_page_t *bpage= request.bpage; + ut_ad(bpage); + const auto state= bpage->state(); + /* io-fix can only be cleared by buf_page_t::write_complete() + and buf_page_t::read_complete() */ + ut_ad(state >= buf_page_t::WRITE_FIX); + ut_ad(!buf_dblwr.is_inside(bpage->id())); + ut_ad(request.node->space->id == bpage->id().space()); + + if (request.slot) + request.slot->release(); + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(*bpage, false); + DBUG_PRINT("ib_buf", ("write page %u:%u", + bpage->id().space(), bpage->id().page_no())); + + mysql_mutex_assert_not_owner(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + + if (request.is_LRU()) + { + const bool temp= bpage->oldest_modification() == 2; + if (!temp && state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + /* We must hold buf_pool.mutex while releasing the block, so that + no other thread can access it before we have freed it. 
*/ + mysql_mutex_lock(&buf_pool.mutex); + bpage->write_complete(temp, error); + if (!error) + buf_LRU_free_page(bpage, true); + mysql_mutex_unlock(&buf_pool.mutex); + + buf_pool.n_flush_dec(); + } + else + { + if (state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + bpage->write_complete(false, error); + } +} + +/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. +@param[in,out] page page to update +@param[in] size compressed page size */ +void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size) +{ + ut_ad(size > 0); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + page_zip_calc_checksum(page, size, false)); +} + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page) +{ + ut_d(bool compressed = false); + ut_d(bool corrupted = false); + ut_d(const uint size = buf_page_full_crc32_size(page, &compressed, + &corrupted)); + ut_ad(!compressed); + ut_ad(!corrupted); + ut_ad(size == uint(srv_page_size)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(page + payload, my_crc32c(0, page, payload)); +} + +/** Initialize a page for writing to the tablespace. 
+@param[in] block buffer block; NULL if bypassing + the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if + uncompressed +@param[in] use_full_checksum whether tablespace uses full checksum */ +void +buf_flush_init_for_writing( + const buf_block_t* block, + byte* page, + void* page_zip_, + bool use_full_checksum) +{ + if (block && block->page.frame != page) { + /* If page is encrypted in full crc32 format then + checksum stored already as a part of fil_encrypt_buf() */ + ut_ad(use_full_checksum); + return; + } + + ut_ad(!block || block->page.frame == page); + ut_ad(page); + + if (page_zip_) { + page_zip_des_t* page_zip; + ulint size; + + page_zip = static_cast<page_zip_des_t*>(page_zip_); + ut_ad(!block || &block->page.zip == page_zip); + size = page_zip_get_size(page_zip); + + ut_ad(size); + ut_ad(ut_is_2pow(size)); + ut_ad(size <= UNIV_ZIP_SIZE_MAX); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. 
*/ + memcpy(page_zip->data, page, size); + /* fall through */ + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + buf_flush_update_zip_checksum(page_zip->data, size); + return; + } + + ib::error() << "The compressed page to be written" + " seems corrupt:"; + ut_print_buf(stderr, page, size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, size); + putc('\n', stderr); + ut_error; + } + + if (use_full_checksum) { + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 4 == 0, "aligned"); + memcpy_aligned<4>(page + srv_page_size + - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + page, 4); + return buf_flush_assign_full_crc32_checksum(page); + } + + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 8 == 0, "aligned"); + memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + FIL_PAGE_LSN + page, 8); + + if (block && srv_page_size == 16384) { + /* The page type could be garbage in old files + created before MySQL 5.5. Such files always + had a page size of 16 kilobytes. */ + ulint page_type = fil_page_get_type(page); + ulint reset_type = page_type; + + switch (block->page.id().page_no() % 16384) { + case 0: + reset_type = block->page.id().page_no() == 0 + ? 
FIL_PAGE_TYPE_FSP_HDR + : FIL_PAGE_TYPE_XDES; + break; + case 1: + reset_type = FIL_PAGE_IBUF_BITMAP; + break; + case FSP_TRX_SYS_PAGE_NO: + if (block->page.id() + == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) { + reset_type = FIL_PAGE_TYPE_TRX_SYS; + break; + } + /* fall through */ + default: + switch (page_type) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + case FIL_PAGE_UNDO_LOG: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_TYPE_SYS: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_IBUF_BITMAP: + /* These pages should have + predetermined page numbers + (see above). */ + default: + reset_type = FIL_PAGE_TYPE_UNKNOWN; + break; + } + } + + if (UNIV_UNLIKELY(page_type != reset_type)) { + ib::info() + << "Resetting invalid page " + << block->page.id() << " type " + << page_type << " to " + << reset_type << " when flushing."; + fil_page_set_type(page, reset_type); + } + } + + const uint32_t checksum = buf_calc_page_crc32(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + checksum); +} + +/** Reserve a buffer for compression. +@param[in,out] slot reserved slot */ +static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot) +{ + if (slot->comp_buf) + return; + /* Both Snappy and LZO compression methods require that the output + buffer be bigger than input buffer. Adjust the allocated size. 
*/ + ulint size= srv_page_size; + if (provider_service_lzo->is_loaded) + size= LZO1X_1_15_MEM_COMPRESS; + else if (provider_service_snappy->is_loaded) + size= snappy_max_compressed_length(size); + slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size)); +} + +/** Encrypt a buffer of temporary tablespace +@param[in] offset Page offset +@param[in] s Page to encrypt +@param[in,out] d Output buffer +@return encrypted buffer or NULL */ +static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d) +{ + /* Calculate the start offset in a page */ + uint srclen= static_cast<uint>(srv_page_size) - + (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true)) + return NULL; + + const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(d + payload, my_crc32c(0, d, payload)); + + srv_stats.pages_encrypted.inc(); + srv_stats.n_temp_blocks_encrypted.inc(); + return d; +} + +/** Encryption and page_compression hook that is called just before +a page is written to disk. 
+@param[in,out] space tablespace +@param[in,out] bpage buffer page +@param[in] s physical page frame that is being encrypted +@param[in,out] size payload size in bytes +@return page frame to be written to file +(may be src_frame or an encrypted/compressed copy of it) */ +static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, + buf_tmp_buffer_t **slot, size_t *size) +{ + ut_ad(!bpage->is_freed()); + ut_ad(space->id == bpage->id().space()); + ut_ad(!*slot); + + const uint32_t page_no= bpage->id().page_no(); + + switch (page_no) { + case TRX_SYS_PAGE_NO: + if (bpage->id().space() != TRX_SYS_SPACE) + break; + /* The TRX_SYS page is neither encrypted nor compressed, because + it contains the address of the doublewrite buffer. */ + /* fall through */ + case 0: + /* Page 0 of a tablespace is not encrypted/compressed */ + return s; + } + + fil_space_crypt_t *crypt_data= space->crypt_data; + bool encrypted, page_compressed; + if (space->purpose == FIL_TYPE_TEMPORARY) + { + ut_ad(!crypt_data); + encrypted= innodb_encrypt_temporary_tables; + page_compressed= false; + } + else + { + encrypted= crypt_data && !crypt_data->not_encrypted() && + crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && + (!crypt_data->is_default_encryption() || srv_encrypt_tables); + page_compressed= space->is_compressed(); + } + + const bool full_crc32= space->full_crc32(); + + if (!encrypted && !page_compressed) + { + /* No need to encrypt or compress. Clear key-version & crypt-checksum. 
*/ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2, + "not perfect alignment"); + if (full_crc32) + memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4); + else + memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + return s; + } + + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + if (full_crc32) + memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + s, 4); + + ut_ad(!bpage->zip_size() || !page_compressed); + /* Find free slot from temporary memory array */ + *slot= buf_pool.io_buf_reserve(); + ut_a(*slot); + (*slot)->allocate(); + + byte *d= (*slot)->crypt_buf; + + if (!page_compressed) + { +not_compressed: + d= space->purpose == FIL_TYPE_TEMPORARY + ? buf_tmp_page_encrypt(page_no, s, d) + : fil_space_encrypt(space, page_no, s, d); + } + else + { + ut_ad(space->purpose != FIL_TYPE_TEMPORARY); + /* First we compress the page content */ + buf_tmp_reserve_compression_buf(*slot); + byte *tmp= (*slot)->comp_buf; + ulint len= fil_page_compress(s, tmp, space->flags, + fil_space_get_block_size(space, page_no), + encrypted); + + if (!len) + goto not_compressed; + + *size= len; + + if (full_crc32) + { + ut_d(bool compressed = false); + len= buf_page_full_crc32_size(tmp, +#ifdef UNIV_DEBUG + &compressed, +#else + NULL, +#endif + NULL); + ut_ad(compressed); + } + + /* Workaround for MDEV-15527. */ + memset(tmp + len, 0 , srv_page_size - len); + + if (encrypted) + tmp= fil_space_encrypt(space, page_no, tmp, d); + + if (full_crc32) + { + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); + ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); + } + + d= tmp; + } + + (*slot)->out_buf= d; + return d; +} + +/** Free a page whose underlying file page has been freed. 
*/ +ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept +{ + mysql_mutex_assert_owner(&mutex); + ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) + if (fsp_is_system_temporary(bpage->id().space())) + { + ut_ad(bpage->frame); + ut_ad(oldest_modification == 2); + bpage->clear_oldest_modification(); + } + else + { + mysql_mutex_lock(&flush_list_mutex); + ut_ad(oldest_modification > 2); + delete_from_flush_list(bpage); + mysql_mutex_unlock(&flush_list_mutex); + } + + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); +} + +/** Write a flushable page to a file or free a freeable block. +@param evict whether to evict the page on write completion +@param space tablespace +@return whether a page write was initiated and buf_pool.mutex released */ +bool buf_page_t::flush(bool evict, fil_space_t *space) +{ + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(in_file()); + ut_ad(in_LRU_list); + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); + ut_ad(evict || space != fil_system.temp_space); + ut_ad(space->referenced()); + + const auto s= state(); + ut_a(s >= FREED); + + if (s < UNFIXED) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); + ut_ad(lsn >= oldest_modification()); + if (lsn > log_sys.get_flushed_lsn()) + { + mysql_mutex_unlock(&buf_pool.mutex); + log_write_up_to(lsn, true); + mysql_mutex_lock(&buf_pool.mutex); + } + } + buf_pool.release_freed_page(this); + return false; + } + + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); + ut_ad(f >= UNFIXED); + ut_ad(f < READ_FIX); + ut_ad((space == fil_system.temp_space) + ? oldest_modification() == 2 + : oldest_modification() > 2); + + /* Increment the I/O operation count used for selecting LRU policy. 
*/ + buf_LRU_stat_inc_io(); + mysql_mutex_unlock(&buf_pool.mutex); + + IORequest::Type type= IORequest::WRITE_ASYNC; + if (UNIV_UNLIKELY(evict)) + { + type= IORequest::WRITE_LRU; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + + /* Apart from the U-lock, this block will also be protected by + is_write_fixed() and oldest_modification()>1. + Thus, it cannot be relocated or removed. */ + + DBUG_PRINT("ib_buf", ("%s %u page %u:%u", + evict ? "LRU" : "flush_list", + id().space(), id().page_no())); + + buf_block_t *block= reinterpret_cast<buf_block_t*>(this); + page_t *write_frame= zip.data; + + space->reacquire(); + size_t size; +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + size_t orig_size; +#endif + buf_tmp_buffer_t *slot= nullptr; + + if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ + { + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + size= zip_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + buf_flush_update_zip_checksum(write_frame, size); + write_frame= buf_page_encrypt(space, this, write_frame, &slot, &size); + ut_ad(size == zip_size()); + } + else + { + byte *page= frame; + size= block->physical_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + + if (space->full_crc32()) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!write_frame); + page= buf_page_encrypt(space, this, page, &slot, &size); + buf_flush_init_for_writing(block, page, nullptr, true); + } + else + { + buf_flush_init_for_writing(block, page, write_frame ? &zip : nullptr, + false); + page= buf_page_encrypt(space, this, write_frame ? 
write_frame : page, + &slot, &size); + } + +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + if (size != orig_size) + { + switch (space->chain.start->punch_hole) { + case 1: + static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == + IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); + type= + IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); + break; + case 2: + size= orig_size; + } + } +#endif + write_frame= page; + } + + if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN + + (write_frame ? write_frame + : frame))); + ut_ad(lsn >= oldest_modification()); + log_write_up_to(lsn, true); + } + space->io(IORequest{type, this, slot}, physical_offset(), size, + write_frame, this); + } + else + buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type}, + size); + return true; +} + +/** Check whether a page can be flushed from the buf_pool. +@param id page identifier +@param fold id.fold() +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return whether the page can be flushed */ +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, + bool evict) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(fold == id.fold()); + + /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */ + const buf_page_t *bpage= + buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold)); + + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + return false; + + /* We avoid flushing 'non-old' blocks in an eviction flush, because the + flushed blocks are soon freed */ + if (evict && !bpage->is_old()) + return false; + + return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); +} + +/** Check which neighbors of a page can be flushed from the buf_pool. 
+@param space tablespace +@param id page identifier of a dirty page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return last page number that can be flushed */ +static page_id_t buf_flush_check_neighbors(const fil_space_t &space, + page_id_t &id, bool contiguous, + bool evict) +{ + ut_ad(id.page_no() < space.size + + (space.physical_size() == 2048 ? 1 + : space.physical_size() == 1024 ? 3 : 0)); + /* When flushed, dirty blocks are searched in neighborhoods of this + size, and flushed along with the original page. */ + const ulint s= buf_pool.curr_size / 16; + const uint32_t read_ahead= buf_pool.read_ahead_area; + const uint32_t buf_flush_area= read_ahead > s + ? static_cast<uint32_t>(s) : read_ahead; + page_id_t low= id - (id.page_no() % buf_flush_area); + page_id_t high= low + buf_flush_area; + high.set_page_no(std::min(high.page_no(), space.last_page_number())); + + if (!contiguous) + { + high= std::max(id + 1, high); + id= low; + return high; + } + + /* Determine the contiguous dirty area around id. */ + const ulint id_fold= id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (id > low) + { + ulint fold= id_fold; + for (page_id_t i= id - 1;; --i) + { + fold--; + if (!buf_flush_check_neighbor(i, fold, evict)) + { + low= i + 1; + break; + } + if (i == low) + break; + } + } + + page_id_t i= id; + id= low; + ulint fold= id_fold; + while (++i < high) + { + ++fold; + if (!buf_flush_check_neighbor(i, fold, evict)) + break; + } + + mysql_mutex_unlock(&buf_pool.mutex); + return i; +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Apply freed_ranges to the file. 
+@param writable whether the file is writable +@return number of pages written or hole-punched */ +uint32_t fil_space_t::flush_freed(bool writable) +{ + const bool punch_hole= chain.start->punch_hole == 1; + if (!punch_hole && !srv_immediate_scrub_data_uncompressed) + return 0; + + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); + + for (;;) + { + freed_range_mutex.lock(); + if (freed_ranges.empty()) + { + freed_range_mutex.unlock(); + return 0; + } + const lsn_t flush_lsn= last_freed_lsn; + if (log_sys.get_flushed_lsn() >= flush_lsn) + break; + freed_range_mutex.unlock(); + log_write_up_to(flush_lsn, true); + } + + const unsigned physical{physical_size()}; + + range_set freed= std::move(freed_ranges); + uint32_t written= 0; + + if (!writable); + else if (punch_hole) + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + reacquire(); + io(IORequest(IORequest::PUNCH_RANGE), + os_offset_t{range.first} * physical, + (range.last - range.first + 1) * physical, nullptr); + } + } + else + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + for (os_offset_t i= range.first; i <= range.last; i++) + { + reacquire(); + io(IORequest(IORequest::WRITE_ASYNC), i * physical, physical, + const_cast<byte*>(field_ref_zero)); + } + } + } + + freed_range_mutex.unlock(); + return written; +} + +/** Flushes to disk all flushable pages within the flush area +and also write zeroes or punch the hole for the freed ranges of pages. 
+@param space tablespace +@param page_id page identifier +@param bpage buffer page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@param n_flushed number of pages flushed so far in this batch +@param n_to_flush maximum number of pages we are allowed to flush +@return number of pages flushed */ +static ulint buf_flush_try_neighbors(fil_space_t *space, + const page_id_t page_id, + buf_page_t *bpage, + bool contiguous, bool evict, + ulint n_flushed, ulint n_to_flush) +{ + mysql_mutex_unlock(&buf_pool.mutex); + + ut_ad(space->id == page_id.space()); + ut_ad(bpage->id() == page_id); + + ulint count= 0; + page_id_t id= page_id; + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); + + ut_ad(page_id >= id); + ut_ad(page_id < high); + + for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) + { + if (UNIV_UNLIKELY(space->is_stopping_writes())) + { + if (bpage) + bpage->lock.u_unlock(true); + break; + } + + if (count + n_flushed >= n_to_flush) + { + if (id > page_id) + break; + /* If the page whose neighbors we are flushing has not been + flushed yet, we must flush the page that we selected originally. 
*/ + id= page_id; + id_fold= id.fold(); + } + + const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *b= buf_pool.page_hash.get(id, chain)) + { + ut_ad(b->in_file()); + if (id == page_id) + { + ut_ad(bpage == b); + bpage= nullptr; + ut_ad(!buf_pool.watch_is_sentinel(*b)); + ut_ad(b->oldest_modification() > 1); + flush: + if (b->flush(evict, space)) + { + ++count; + continue; + } + } + /* We avoid flushing 'non-old' blocks in an eviction flush, + because the flushed blocks are soon freed */ + else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) && + b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + { + if (b->oldest_modification() < 2) + b->lock.u_unlock(true); + else + goto flush; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } + + if (count > 1) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1); + } + + return count; +} + +/*******************************************************************//** +This utility moves the uncompressed frames of pages to the free list. +Note that this function does not actually flush any data to disk. It +just detaches the uncompressed frames from the compressed pages at the +tail of the unzip_LRU and puts those freed frames in the free list. +@return number of blocks moved to the free list. */ +static ulint buf_free_from_unzip_LRU_list_batch() +{ + ulint scanned = 0; + ulint count = 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + + while (block + && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth + && UT_LIST_GET_LEN(buf_pool.unzip_LRU) + > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + + ++scanned; + if (buf_LRU_free_page(&block->page, false)) { + /* Block was freed. 
buf_pool.mutex potentially + released and reacquired */ + ++count; + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } else { + block = UT_LIST_GET_PREV(unzip_LRU, block); + } + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } + + return(count); +} + +/** Acquire a tablespace reference for writing. +@param id tablespace identifier +@return tablespace +@retval nullptr if the tablespace is missing or inaccessible */ +fil_space_t *fil_space_t::get_for_write(uint32_t id) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_get_by_id(id); + const uint32_t n= space ? space->acquire_low(STOPPING_WRITES) : 0; + + if (n & STOPPING_WRITES) + space= nullptr; + else if ((n & CLOSING) && !space->prepare_acquired()) + space= nullptr; + + mysql_mutex_unlock(&fil_system.mutex); + return space; +} + +/** Start writing out pages for a tablespace. +@param id tablespace identifier +@return tablespace and number of pages written */ +static std::pair<fil_space_t*, uint32_t> buf_flush_space(const uint32_t id) +{ + if (fil_space_t *space= fil_space_t::get_for_write(id)) + return {space, space->flush_freed(true)}; + return {nullptr, 0}; +} + +struct flush_counters_t +{ + /** number of dirty pages flushed */ + ulint flushed; + /** number of clean pages evicted */ + ulint evicted; +}; + +/** Discard a dirty page, and release buf_pool.flush_list_mutex. 
@param bpage dirty page whose tablespace is not accessible */
static void buf_flush_discard_page(buf_page_t *bpage)
{
  ut_ad(bpage->in_file());
  ut_ad(bpage->oldest_modification());

  /* Remove from buf_pool.flush_list without writing anything to disk;
  the tablespace is gone, so the page contents are irrelevant. */
  buf_pool.delete_from_flush_list(bpage);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  ut_d(const auto state= bpage->state());
  ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED ||
        state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT);
  bpage->lock.u_unlock(true);
  buf_LRU_free_page(bpage, true);
}

/** Flush dirty blocks from the end buf_pool.LRU,
and move clean blocks to buf_pool.free.
@param max   maximum number of blocks to flush
@param evict whether dirty pages are to be evicted after flushing them
@param n     counts of flushed and evicted pages */
static void buf_flush_LRU_list_batch(ulint max, bool evict,
                                     flush_counters_t *n)
{
  ulint scanned= 0;
  ulint free_limit= srv_LRU_scan_depth;

  mysql_mutex_assert_owner(&buf_pool.mutex);
  /* While shrinking the buffer pool, aim to free enough blocks to
  satisfy the remaining withdraw target as well. */
  if (buf_pool.withdraw_target && buf_pool.is_shrinking())
    free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);

  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
    ? 0 : srv_flush_neighbors;
  fil_space_t *space= nullptr;
  uint32_t last_space_id= FIL_NULL;
  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");

  /* Scan backwards from the LRU tail; buf_pool.lru_hp is a hazard
  pointer that other threads adjust if they remove our 'prev'. */
  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
       bpage &&
       ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
         UT_LIST_GET_LEN(buf_pool.free) < free_limit) ||
        recv_recovery_is_on());
       ++scanned, bpage= buf_pool.lru_hp.get())
  {
    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
    buf_pool.lru_hp.set(prev);
    auto state= bpage->state();
    ut_ad(state >= buf_page_t::FREED);
    ut_ad(bpage->in_LRU_list);

    if (!bpage->oldest_modification())
    {
    evict:
      /* Only evict unfixed pages (no I/O fix, no buffer fix). */
      if (state != buf_page_t::FREED &&
          (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state)))
        continue;
      buf_LRU_free_page(bpage, true);
      ++n->evicted;
      /* Every 32 scanned pages, briefly release buf_pool.mutex so that
      other threads can make progress. */
      if (UNIV_LIKELY(scanned & 31))
        continue;
      mysql_mutex_unlock(&buf_pool.mutex);
    reacquire_mutex:
      mysql_mutex_lock(&buf_pool.mutex);
      continue;
    }

    if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true))
    {
      ut_ad(!bpage->is_io_fixed());
      bool do_evict= evict;
      switch (bpage->oldest_modification()) {
      case 1:
        /* The page only needs to be removed from the flush list. */
        mysql_mutex_lock(&buf_pool.flush_list_mutex);
        if (ut_d(lsn_t lsn=) bpage->oldest_modification())
        {
          ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */
          buf_pool.delete_from_flush_list(bpage);
        }
        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
        /* fall through */
      case 0:
        bpage->lock.u_unlock(true);
        goto evict;
      case 2:
        /* LRU flushing will always evict pages of the temporary tablespace. */
        do_evict= true;
      }
      /* Block is ready for flush. Dispatch an IO request.
      If do_evict, the page may be evicted by buf_page_write_complete(). */
      const page_id_t page_id(bpage->id());
      const uint32_t space_id= page_id.space();
      if (!space || space->id != space_id)
      {
        if (last_space_id != space_id)
        {
          /* Park the hazard pointer on this page while we release
          buf_pool.mutex to look up the tablespace. */
          buf_pool.lru_hp.set(bpage);
          mysql_mutex_unlock(&buf_pool.mutex);
          if (space)
            space->release();
          auto p= buf_flush_space(space_id);
          space= p.first;
          last_space_id= space_id;
          if (!space)
          {
            mysql_mutex_lock(&buf_pool.mutex);
            goto no_space;
          }
          mysql_mutex_lock(&buf_pool.mutex);
          buf_pool.stat.n_pages_written+= p.second;
        }
        else
        {
          /* We already determined that this tablespace is gone. */
          ut_ad(!space);
          goto no_space;
        }
      }
      else if (space->is_stopping_writes())
      {
        space->release();
        space= nullptr;
      no_space:
        mysql_mutex_lock(&buf_pool.flush_list_mutex);
        buf_flush_discard_page(bpage);
        continue;
      }

      if (n->flushed >= max && !recv_recovery_is_on())
      {
        bpage->lock.u_unlock(true);
        break;
      }

      if (neighbors && space->is_rotational())
        n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
                                             neighbors == 1,
                                             do_evict, n->flushed, max);
      else if (bpage->flush(do_evict, space))
        ++n->flushed;
      else
        continue;

      goto reacquire_mutex;
    }
    else
      /* Can't evict or dispatch this block. Go to previous. */
      ut_ad(buf_pool.lru_hp.is_hp(prev));
  }

  buf_pool.lru_hp.set(nullptr);

  if (space)
    space->release();

  if (scanned)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
                                 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
                                 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
                                 scanned);
}

/** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system.
@param max   maximum number of blocks to flush
@param evict whether dirty pages are to be evicted after flushing them
@param n     counts of flushed and evicted pages */
static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n)
{
  /* Prefer detaching uncompressed frames from the unzip_LRU when that
  list has grown disproportionately large. */
  if (buf_LRU_evict_from_unzip_LRU())
    buf_free_from_unzip_LRU_list_batch();
  n->evicted= 0;
  n->flushed= 0;
  buf_flush_LRU_list_batch(max, evict, n);

  mysql_mutex_assert_owner(&buf_pool.mutex);
  buf_lru_freed_page_count+= n->evicted;
  buf_lru_flush_page_count+= n->flushed;
  buf_pool.stat.n_pages_written+= n->flushed;
}

/** This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
@param max_n maximum number of blocks to flush
@param lsn   once an oldest_modification>=lsn is found, terminate the batch
@return number of blocks for which the write request was queued */
static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
{
  ulint count= 0;
  ulint scanned= 0;

  mysql_mutex_assert_owner(&buf_pool.mutex);
  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);

  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
    ? 0 : srv_flush_neighbors;
  fil_space_t *space= nullptr;
  uint32_t last_space_id= FIL_NULL;
  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");

  /* Start from the end of the list looking for a suitable block to be
  flushed. */
  ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);

  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
       bpage && len && count < max_n; ++scanned, len--)
  {
    /* The flush_list is ordered by oldest_modification(); once we see
    an entry at or beyond the target LSN, the batch is done. */
    const lsn_t oldest_modification= bpage->oldest_modification();
    if (oldest_modification >= lsn)
      break;
    ut_ad(bpage->in_file());

    {
      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);

      if (oldest_modification == 1)
      {
        /* The page was already written out; just detach it. */
      clear:
        buf_pool.delete_from_flush_list(bpage);
      skip:
        bpage= prev;
        continue;
      }

      ut_ad(oldest_modification > 2);

      if (!bpage->lock.u_lock_try(true))
        goto skip;

      ut_ad(!bpage->is_io_fixed());

      if (bpage->oldest_modification() == 1)
      {
        bpage->lock.u_unlock(true);
        goto clear;
      }

      /* In order not to degenerate this scan to O(n*n) we attempt to
      preserve the pointer position. Any thread that would remove 'prev'
      from buf_pool.flush_list must adjust the hazard pointer.

      Note: A concurrent execution of buf_flush_list_space() may
      terminate this scan prematurely. The buf_pool.flush_list_active
      should prevent multiple threads from executing
      buf_do_flush_list_batch() concurrently,
      but buf_flush_list_space() is ignoring that. */
      buf_pool.flush_hp.set(prev);
    }

    const page_id_t page_id(bpage->id());
    const uint32_t space_id= page_id.space();
    if (!space || space->id != space_id)
    {
      if (last_space_id != space_id)
      {
        /* Switch to the tablespace of this page; both mutexes must be
        released while acquiring the tablespace reference. */
        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
        mysql_mutex_unlock(&buf_pool.mutex);
        if (space)
          space->release();
        auto p= buf_flush_space(space_id);
        space= p.first;
        last_space_id= space_id;
        mysql_mutex_lock(&buf_pool.mutex);
        buf_pool.stat.n_pages_written+= p.second;
        mysql_mutex_lock(&buf_pool.flush_list_mutex);
      }
      else
        ut_ad(!space);
    }
    else if (space->is_stopping_writes())
    {
      space->release();
      space= nullptr;
    }

    if (!space)
      buf_flush_discard_page(bpage);
    else
    {
      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
      do
      {
        if (neighbors && space->is_rotational())
          count+= buf_flush_try_neighbors(space, page_id, bpage,
                                          neighbors == 1, false, count, max_n);
        else if (bpage->flush(false, space))
          ++count;
        else
          /* Nothing was dispatched; the do{}while(0) 'continue' skips
          the re-acquisition of buf_pool.mutex below. */
          continue;
        mysql_mutex_lock(&buf_pool.mutex);
      }
      while (0);
    }

    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    bpage= buf_pool.flush_hp.get();
  }

  buf_pool.flush_hp.set(nullptr);

  if (space)
    space->release();

  if (scanned)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
                                 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
                                 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
                                 scanned);
  return count;
}

/** Wait until a LRU flush batch ends. */
void buf_flush_wait_LRU_batch_end()
{
  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
  mysql_mutex_assert_not_owner(&buf_pool.mutex);

  if (buf_pool.n_flush())
  {
    /* Tell the thread pool and the server layer that we may block on
    disk I/O, so that they can spawn extra worker threads if needed. */
    tpool::tpool_wait_begin();
    thd_wait_begin(nullptr, THD_WAIT_DISKIO);
    do
      my_cond_wait(&buf_pool.done_flush_LRU,
                   &buf_pool.flush_list_mutex.m_mutex);
    while (buf_pool.n_flush());
    tpool::tpool_wait_end();
    thd_wait_end(nullptr);
  }
}

/** Write out dirty blocks from buf_pool.flush_list.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum number of blocks flushed
@param lsn   buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED,
                                          lsn_t lsn= LSN_MAX)
{
  ut_ad(lsn);
  mysql_mutex_assert_owner(&buf_pool.mutex);

  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  /* Only one flush_list batch may run at a time. */
  if (buf_pool.flush_list_active())
  {
nothing_to_do:
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
    return 0;
  }
  if (!buf_pool.get_oldest_modification(0))
  {
    /* The flush list is empty; wake up any waiters. */
    pthread_cond_broadcast(&buf_pool.done_flush_list);
    goto nothing_to_do;
  }
  buf_pool.flush_list_set_active();
  const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
  if (n_flushed)
    buf_pool.stat.n_pages_written+= n_flushed;
  buf_pool.flush_list_set_inactive();
  pthread_cond_broadcast(&buf_pool.done_flush_list);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (n_flushed)
    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
                                 MONITOR_FLUSH_BATCH_COUNT,
                                 MONITOR_FLUSH_BATCH_PAGES,
                                 n_flushed);

  DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
  return n_flushed;
}

/** Write out dirty blocks from buf_pool.flush_list.
@param max_n wished maximum number of blocks flushed
@param lsn   buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED,
                            lsn_t lsn= LSN_MAX)
{
  mysql_mutex_lock(&buf_pool.mutex);
  ulint n= buf_flush_list_holding_mutex(max_n, lsn);
  mysql_mutex_unlock(&buf_pool.mutex);
  /* Ensure that any doublewrite-buffered pages reach the data files. */
  buf_dblwr.flush_buffered_writes();
  return n;
}

/** Try to flush all the dirty pages that belong to a given tablespace.
@param space     tablespace
@param n_flushed number of pages written
@return whether the flush for some pages might not have been initiated */
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
{
  const auto space_id= space->id;
  ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);

  bool may_have_skipped= false;
  ulint max_n_flush= srv_io_capacity;
  ulint n_flush= 0;

  bool acquired= space->acquire_for_write();
  {
    /* Write out any freed-page information before scanning the list. */
    const uint32_t written{space->flush_freed(acquired)};
    mysql_mutex_lock(&buf_pool.mutex);
    if (written)
      buf_pool.stat.n_pages_written+= written;
  }
  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
  {
    ut_ad(bpage->oldest_modification());
    ut_ad(bpage->in_file());

    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
    if (bpage->oldest_modification() == 1)
      /* The page was already written out; just detach it. */
    clear:
      buf_pool.delete_from_flush_list(bpage);
    else if (bpage->id().space() != space_id);
    else if (!bpage->lock.u_lock_try(true))
      may_have_skipped= true;
    else if (bpage->oldest_modification() == 1)
    {
      bpage->lock.u_unlock(true);
      goto clear;
    }
    else
    {
      /* In order not to degenerate this scan to O(n*n) we attempt to
      preserve the pointer position. Any thread that would remove 'prev'
      from buf_pool.flush_list must adjust the hazard pointer.

      Note: Multiple executions of buf_flush_list_space() may be
      interleaved, and also buf_do_flush_list_batch() may be running
      concurrently. This may terminate our iteration prematurely,
      leading us to return may_have_skipped=true. */
      buf_pool.flush_hp.set(prev);

      if (!acquired)
        /* Writes to the tablespace cannot proceed; drop the page. */
      was_freed:
        buf_flush_discard_page(bpage);
      else
      {
        if (space->is_stopping_writes())
        {
          space->release();
          acquired= false;
          goto was_freed;
        }
        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
        if (bpage->flush(false, space))
        {
          ++n_flush;
          if (!--max_n_flush)
          {
            /* Reached the srv_io_capacity budget for this call. */
            mysql_mutex_lock(&buf_pool.mutex);
            mysql_mutex_lock(&buf_pool.flush_list_mutex);
            may_have_skipped= true;
            goto done;
          }
          mysql_mutex_lock(&buf_pool.mutex);
        }
      }

      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      if (!buf_pool.flush_hp.is_hp(prev))
        may_have_skipped= true;
      bpage= buf_pool.flush_hp.get();
      continue;
    }

    bpage= prev;
  }

  /* Note: this loop may have been executed concurrently with
  buf_do_flush_list_batch() as well as other threads executing
  buf_flush_list_space(). We should always return true from
  buf_flush_list_space() if that should be the case; in
  buf_do_flush_list_batch() we will simply perform less work. */
done:
  buf_pool.flush_hp.set(nullptr);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  buf_pool.stat.n_pages_written+= n_flush;

  buf_pool.try_LRU_scan= true;
  pthread_cond_broadcast(&buf_pool.done_free);
  mysql_mutex_unlock(&buf_pool.mutex);

  if (n_flushed)
    *n_flushed= n_flush;

  if (acquired)
    space->release();

  if (space->purpose == FIL_TYPE_IMPORT)
    os_aio_wait_until_no_pending_writes(true);
  else
    buf_dblwr.flush_buffered_writes();

  return may_have_skipped;
}

/** Write out dirty blocks from buf_pool.LRU,
and move clean blocks to buf_pool.free.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum number of blocks flushed
@param evict whether to evict pages after flushing
@return evict ?
number of processed pages : number of pages written */
ulint buf_flush_LRU(ulint max_n, bool evict)
{
  mysql_mutex_assert_owner(&buf_pool.mutex);

  flush_counters_t n;
  buf_do_LRU_batch(max_n, evict, &n);

  ulint pages= n.flushed;

  if (n.evicted)
  {
    if (evict)
      pages+= n.evicted;
    /* Free blocks became available; wake up any waiters. */
    buf_pool.try_LRU_scan= true;
    pthread_cond_broadcast(&buf_pool.done_free);
  }

  return pages;
}

#ifdef HAVE_PMEM
# include <libpmem.h>
#endif

/** Write checkpoint information to the log header and release mutex.
@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
{
  ut_ad(!srv_read_only_mode);
  ut_ad(end_lsn >= next_checkpoint_lsn);
  ut_ad(end_lsn <= get_lsn());
  ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() ||
        srv_shutdown_state > SRV_SHUTDOWN_INITIATED);

  DBUG_PRINT("ib_log",
             ("checkpoint at " LSN_PF " written", next_checkpoint_lsn));

  /* Checkpoints alternate between the two header slots. */
  auto n= next_checkpoint_no;
  const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1};
  static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency");
  static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility");
  byte* c= my_assume_aligned<CPU_LEVEL1_DCACHE_LINESIZE>
    (is_pmem() ? buf + offset : checkpoint_buf);
  memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(c, 0, CPU_LEVEL1_DCACHE_LINESIZE);
  /* Checkpoint block layout: [0..7]=checkpoint LSN, [8..15]=end LSN,
  [60..63]=CRC-32C of the first 60 bytes. */
  mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn);
  mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn);
  mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60));

  lsn_t resizing;

#ifdef HAVE_PMEM
  if (is_pmem())
  {
    resizing= resize_lsn.load(std::memory_order_relaxed);

    if (resizing > 1 && resizing <= next_checkpoint_lsn)
    {
      /* Mirror the checkpoint into the in-progress resized log. */
      memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64);
      header_write(resize_buf, resizing, is_encrypted());
      pmem_persist(resize_buf, resize_target);
    }
    pmem_persist(c, 64);
  }
  else
#endif
  {
    ut_ad(!checkpoint_pending);
    checkpoint_pending= true;
    /* Release the exclusive latch for the duration of the write. */
    latch.wr_unlock();
    log_write_and_flush_prepare();
    resizing= resize_lsn.load(std::memory_order_relaxed);
    /* FIXME: issue an asynchronous write */
    log.write(offset, {c, get_block_size()});
    if (resizing > 1 && resizing <= next_checkpoint_lsn)
    {
      byte *buf= static_cast<byte*>(aligned_malloc(4096, 4096));
      memset_aligned<4096>(buf, 0, 4096);
      header_write(buf, resizing, is_encrypted());
      resize_log.write(0, {buf, 4096});
      aligned_free(buf);
      resize_log.write(CHECKPOINT_1, {c, get_block_size()});
    }

    if (srv_file_flush_method != SRV_O_DSYNC)
      ut_a(log.flush());
    latch.wr_lock(SRW_LOCK_CALL);
    ut_ad(checkpoint_pending);
    checkpoint_pending= false;
    /* Re-read under the latch; a resize may have begun meanwhile. */
    resizing= resize_lsn.load(std::memory_order_relaxed);
  }

  ut_ad(!checkpoint_pending);
  next_checkpoint_no++;
  const lsn_t checkpoint_lsn{next_checkpoint_lsn};
  last_checkpoint_lsn= checkpoint_lsn;

  DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF,
                        checkpoint_lsn, get_flushed_lsn()));
  if (overwrite_warned)
  {
    sql_print_information("InnoDB: Crash recovery was broken "
                          "between LSN=" LSN_PF
                          " and checkpoint LSN=" LSN_PF ".",
                          overwrite_warned, checkpoint_lsn);
    overwrite_warned= 0;
  }

  lsn_t resizing_completed= 0;

  if (resizing > 1 && resizing <= checkpoint_lsn)
  {
    /* The checkpoint is beyond the start of the resized log:
    the resize can now be concluded (adopted or discarded). */
    ut_ad(is_pmem() == !resize_flush_buf);

    if (!is_pmem())
    {
      if (srv_file_flush_method != SRV_O_DSYNC)
        ut_a(resize_log.flush());
      IF_WIN(log.close(),);
    }

    if (resize_rename())
    {
      /* Resizing failed. Discard the log_sys.resize_log. */
#ifdef HAVE_PMEM
      if (is_pmem())
        my_munmap(resize_buf, resize_target);
      else
#endif
      {
        ut_free_dodump(resize_buf, buf_size);
        ut_free_dodump(resize_flush_buf, buf_size);
#ifdef _WIN32
        /* On Windows the old log had to be closed for the rename
        attempt; reopen it. */
        ut_ad(!log.is_opened());
        bool success;
        log.m_file=
          os_file_create_func(get_log_file_path().c_str(),
                              OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
                              OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
        ut_a(success);
        ut_a(log.is_opened());
#endif
      }
    }
    else
    {
      /* Adopt the resized log. */
#ifdef HAVE_PMEM
      if (is_pmem())
      {
        my_munmap(buf, file_size);
        buf= resize_buf;
        buf_free= START_OFFSET + (get_lsn() - resizing);
      }
      else
#endif
      {
        IF_WIN(,log.close());
        std::swap(log, resize_log);
        ut_free_dodump(buf, buf_size);
        ut_free_dodump(flush_buf, buf_size);
        buf= resize_buf;
        flush_buf= resize_flush_buf;
      }
      srv_log_file_size= resizing_completed= file_size= resize_target;
      first_lsn= resizing;
      set_capacity();
    }
    ut_ad(!resize_log.is_opened());
    resize_buf= nullptr;
    resize_flush_buf= nullptr;
    resize_target= 0;
    resize_lsn.store(0, std::memory_order_relaxed);
  }

  log_resize_release();

  if (UNIV_LIKELY(resizing <= 1));
  else if (resizing > checkpoint_lsn)
    /* The resize needs a further checkpoint; request more flushing. */
    buf_flush_ahead(resizing, false);
  else if (resizing_completed)
    ib::info() << "Resized log to " << ib::bytes_iec{resizing_completed}
               << "; start LSN=" << resizing;
  else
    buf_flush_ahead(end_lsn + 1, false);
}

/** Initiate a log checkpoint, discarding the start of the log.
@param oldest_lsn the checkpoint LSN
@param end_lsn    log_sys.get_lsn()
@return true if success, false if a checkpoint write was already running */
static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
{
  ut_ad(!srv_read_only_mode);
#ifndef SUX_LOCK_GENERIC
  ut_ad(log_sys.latch.is_write_locked());
#endif
  ut_ad(oldest_lsn <= end_lsn);
  ut_ad(end_lsn == log_sys.get_lsn());

  if (oldest_lsn == log_sys.last_checkpoint_lsn ||
      (oldest_lsn == end_lsn &&
       !log_sys.resize_in_progress() &&
       oldest_lsn == log_sys.last_checkpoint_lsn +
       (log_sys.is_encrypted()
        ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT)))
  {
    /* Do nothing, because nothing was logged (other than a
    FILE_CHECKPOINT record) since the previous checkpoint. */
  do_nothing:
    log_sys.latch.wr_unlock();
    return true;
  }

  ut_ad(!recv_no_log_write);
  ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn);
  /* Repeat the FILE_MODIFY records after the checkpoint, in case some
  log records between the checkpoint and log_sys.lsn need them.
  Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
  see a FILE_CHECKPOINT after the checkpoint, except on clean
  shutdown, where the log will be empty after the checkpoint.

  It is important that we write out the redo log before any further
  dirty pages are flushed to the tablespace files. At this point,
  because we hold exclusive log_sys.latch,
  mtr_t::commit() in other threads will be blocked,
  and no pages can be added to buf_pool.flush_list. */
  const lsn_t flush_lsn{fil_names_clear(oldest_lsn)};
  ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
  /* Release the latch while durably writing the log, then re-check
  whether another thread already advanced the checkpoint. */
  log_sys.latch.wr_unlock();
  log_write_up_to(flush_lsn, true);
  log_sys.latch.wr_lock(SRW_LOCK_CALL);
  if (log_sys.last_checkpoint_lsn >= oldest_lsn)
    goto do_nothing;

  ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);

  if (log_sys.checkpoint_pending)
  {
    /* A checkpoint write is running */
    log_sys.latch.wr_unlock();
    return false;
  }

  log_sys.next_checkpoint_lsn= oldest_lsn;
  /* write_checkpoint() will release log_sys.latch. */
  log_sys.write_checkpoint(end_lsn);

  return true;
}

/** Make a checkpoint. Note that this function does not flush dirty
blocks from the buffer pool: it only checks what is lsn of the oldest
modification in the pool, and writes information about the lsn in
log file. Use log_make_checkpoint() to flush also the pool.
@retval true if the checkpoint was or had been made
@retval false if a checkpoint write was already running */
static bool log_checkpoint()
{
  if (recv_recovery_is_on())
    recv_sys.apply(true);

  /* For most flush methods, data files must be durably synced before
  the checkpoint may discard the corresponding redo log. */
  switch (srv_file_flush_method) {
  case SRV_NOSYNC:
  case SRV_O_DIRECT_NO_FSYNC:
    break;
  default:
    fil_flush_file_spaces();
  }

  log_sys.latch.wr_lock(SRW_LOCK_CALL);
  const lsn_t end_lsn= log_sys.get_lsn();
  mysql_mutex_lock(&buf_pool.flush_list_mutex);
  const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  return log_checkpoint_low(oldest_lsn, end_lsn);
}

/** Make a checkpoint. */
ATTRIBUTE_COLD void log_make_checkpoint()
{
  buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
  while (!log_checkpoint());
}

/** Wait for all dirty pages up to an LSN to be written out.
NOTE: The calling thread is not allowed to hold any buffer page latches!
 */
static void buf_flush_wait(lsn_t lsn)
{
  ut_ad(lsn <= log_sys.get_lsn());

  lsn_t oldest_lsn;

  while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn)
  {
    if (buf_flush_sync_lsn < lsn)
    {
      /* Ask the page cleaner to flush up to our target LSN and wait
      for it to signal progress. */
      buf_flush_sync_lsn= lsn;
      buf_pool.page_cleaner_set_idle(false);
      pthread_cond_signal(&buf_pool.do_flush_list);
      my_cond_wait(&buf_pool.done_flush_list,
                   &buf_pool.flush_list_mutex.m_mutex);
      oldest_lsn= buf_pool.get_oldest_modification(lsn);
      if (oldest_lsn >= lsn)
        break;
    }
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
    os_aio_wait_until_no_pending_writes(false);
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
  }

  if (oldest_lsn >= buf_flush_sync_lsn)
  {
    /* Our target was reached; reset the request and wake any other
    waiters. */
    buf_flush_sync_lsn= 0;
    pthread_cond_broadcast(&buf_pool.done_flush_list);
  }
}

/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{
  ut_ad(sync_lsn);
  ut_ad(sync_lsn < LSN_MAX);
  ut_ad(!srv_read_only_mode);

  if (recv_recovery_is_on())
    recv_sys.apply(true);

  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
  {
    MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);

#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
    if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
    {
      /* No page cleaner thread (e.g. during startup or shutdown):
      perform the flushing ourselves. */
      do
      {
        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
        ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
        if (n_pages)
        {
          MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
                                       MONITOR_FLUSH_SYNC_COUNT,
                                       MONITOR_FLUSH_SYNC_PAGES, n_pages);
        }
        os_aio_wait_until_no_pending_writes(false);
        mysql_mutex_lock(&buf_pool.flush_list_mutex);
      }
      while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
    }
    else
#endif
    {
      thd_wait_begin(nullptr, THD_WAIT_DISKIO);
      tpool::tpool_wait_begin();
      buf_flush_wait(sync_lsn);
      tpool::tpool_wait_end();
      thd_wait_end(nullptr);
    }
  }

  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
  {
    /* If the buffer pool was clean, no log write was guaranteed
    to happen until now. There could be an outstanding FILE_CHECKPOINT
    record from a previous fil_names_clear() call, which we must
    write out before we can advance the checkpoint. */
    log_write_up_to(sync_lsn, true);
    DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);
    log_checkpoint();
  }
}

/** Initiate more eager page flushing if the log checkpoint age is too old.
@param lsn     buf_pool.get_oldest_modification(LSN_MAX) target
@param furious true=furious flushing, false=limit to innodb_io_capacity */
ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
{
  ut_ad(!srv_read_only_mode);

  if (recv_recovery_is_on())
    recv_sys.apply(true);

  DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);

  /* "Furious" requests raise the synchronous target; others only the
  asynchronous (best-effort) target. */
  Atomic_relaxed<lsn_t> &limit= furious
    ? buf_flush_sync_lsn : buf_flush_async_lsn;

  if (limit < lsn)
  {
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    if (limit < lsn)
    {
      limit= lsn;
      buf_pool.page_cleaner_set_idle(false);
      pthread_cond_signal(&buf_pool.do_flush_list);
    }
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
  }
}

/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
and try to initiate checkpoints until the target is met.
@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
{
  ut_ad(!srv_read_only_mode);
  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);

  for (;;)
  {
    /* Flush a batch, then try to advance the checkpoint. */
    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
    {
      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
                                   MONITOR_FLUSH_SYNC_COUNT,
                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
    }

    switch (srv_file_flush_method) {
    case SRV_NOSYNC:
    case SRV_O_DIRECT_NO_FSYNC:
      break;
    default:
      fil_flush_file_spaces();
    }

    log_sys.latch.wr_lock(SRW_LOCK_CALL);
    const lsn_t newest_lsn= log_sys.get_lsn();
    mysql_mutex_lock(&buf_pool.flush_list_mutex);
    lsn_t measure= buf_pool.get_oldest_modification(0);
    /* An empty flush list (measure == 0) allows a checkpoint at the
    current log end. */
    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;

    if (!recv_recovery_is_on() &&
        checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
    {
      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
      /* log_checkpoint_low() releases log_sys.latch. */
      log_checkpoint_low(checkpoint_lsn, newest_lsn);
      mysql_mutex_lock(&buf_pool.flush_list_mutex);
      measure= buf_pool.get_oldest_modification(LSN_MAX);
    }
    else
    {
      log_sys.latch.wr_unlock();
      if (!measure)
        measure= LSN_MAX;
    }

    /* After attempting log checkpoint, check if we have reached our target. */
    const lsn_t target= buf_flush_sync_lsn;

    if (measure >= target)
      buf_flush_sync_lsn= 0;
    else if (measure >= buf_flush_async_lsn)
      buf_flush_async_lsn= 0;

    /* wake up buf_flush_wait() */
    pthread_cond_broadcast(&buf_pool.done_flush_list);
    mysql_mutex_unlock(&buf_pool.flush_list_mutex);

    lsn= std::max(lsn, target);

    if (measure >= lsn)
      return;
  }
}

/** Check if the adaptive flushing threshold is recommended based on
redo log capacity filled threshold.
@param oldest_lsn buf_pool.get_oldest_modification()
@return true if adaptive flushing is recommended.
 */
static bool af_needed_for_redo(lsn_t oldest_lsn)
{
  lsn_t age= (log_sys.get_lsn() - oldest_lsn);
  /* Low-water mark: innodb_adaptive_flushing_lwm percent of the
  usable redo log capacity. */
  lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
                static_cast<double>(log_sys.log_capacity) / 100);

  /* if age > af_lwm adaptive flushing is recommended */
  return (age > af_lwm);
}

/*********************************************************************//**
Calculates if flushing is required based on redo generation rate.
@return percent of io_capacity to flush to manage redo space */
static
ulint
af_get_pct_for_lsn(
/*===============*/
	lsn_t	age)	/*!< in: current age of LSN. */
{
	lsn_t	af_lwm = static_cast<lsn_t>(
		srv_adaptive_flushing_lwm
		* static_cast<double>(log_sys.log_capacity) / 100);

	if (age < af_lwm) {
		/* No adaptive flushing. */
		return(0);
	}

	/* Percentage of the asynchronous flushing threshold that the
	current checkpoint age has reached. */
	lsn_t	lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;

	ut_ad(srv_max_io_capacity >= srv_io_capacity);
	/* Scale super-linearly with the age factor (factor^1.5 / 7.5),
	weighted by the io_capacity headroom. */
	return static_cast<ulint>(
		(static_cast<double>(srv_max_io_capacity / srv_io_capacity
				     * lsn_age_factor)
		 * sqrt(static_cast<double>(lsn_age_factor))
		 / 7.5));
}

/** This function is called approximately once every second by
buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
and innodb_adaptive_flushing=ON.
Based on various factors it decides if there is a need to do flushing.
@return number of pages recommended to be flushed
@param last_pages_in	number of pages flushed in previous batch
@param oldest_lsn	buf_pool.get_oldest_modification(0)
@param pct_lwm		innodb_max_dirty_pages_pct_lwm, or 0 to ignore it
@param dirty_blocks	UT_LIST_GET_LEN(buf_pool.flush_list)
@param dirty_pct	100*flush_list.count / (LRU.count + free.count) */
static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
						     lsn_t oldest_lsn,
						     double pct_lwm,
						     ulint dirty_blocks,
						     double dirty_pct)
{
	/* Static state carried across the once-per-second invocations,
	used to compute smoothed page and LSN rates. */
	static	lsn_t		prev_lsn = 0;
	static	ulint		sum_pages = 0;
	static	ulint		avg_page_rate = 0;
	static	ulint		n_iterations = 0;
	static	time_t		prev_time;
	lsn_t			lsn_rate;
	ulint			n_pages = 0;

	const lsn_t cur_lsn = log_sys.get_lsn();
	ut_ad(oldest_lsn <= cur_lsn);
	ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
	time_t curr_time = time(nullptr);
	const double max_pct = srv_max_buf_pool_modified_pct;

	if (!prev_lsn || !pct_for_lsn) {
		/* First call, or the redo age is below the adaptive
		flushing low-water mark: flush based on the dirty-page
		percentage alone. */
		prev_time = curr_time;
		prev_lsn = cur_lsn;
		if (max_pct > 0.0) {
			dirty_pct /= max_pct;
		}

		n_pages = ulint(dirty_pct * double(srv_io_capacity));
		if (n_pages < dirty_blocks) {
			n_pages= std::min<ulint>(srv_io_capacity,
						 dirty_blocks);
		}

func_exit:
		page_cleaner.flush_pass++;
		return n_pages;
	}

	sum_pages += last_pages_in;

	const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);

	/* We update our variables every innodb_flushing_avg_loops
	iterations to smooth out transition in workload. */
	if (++n_iterations >= srv_flushing_avg_loops
	    || time_elapsed >= srv_flushing_avg_loops) {

		avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;

		/* How much LSN we have generated since last call. */
		lsn_rate = (cur_lsn - prev_lsn) / time_elapsed;

		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;

		if (page_cleaner.flush_pass) {
			page_cleaner.flush_time /= page_cleaner.flush_pass;
		}

		prev_lsn = cur_lsn;
		prev_time = curr_time;

		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
			    page_cleaner.flush_time);
		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
			    page_cleaner.flush_pass);

		page_cleaner.flush_time = 0;
		page_cleaner.flush_pass = 0;

		n_iterations = 0;
		sum_pages = 0;
	}

	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);

	/* Combined pressure: the larger of redo-age pressure and
	dirty-page pressure, both normalized to [0,1]. */
	double total_ratio;
	if (pct_lwm == 0.0 || max_pct == 0.0) {
		total_ratio = 1;
	} else {
		total_ratio = std::max(double(pct_for_lsn) / 100,
				       (dirty_pct / max_pct));
	}

	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, ulint(total_ratio * 100));

	/* Estimate pages to be flushed for the lsn progress */
	lsn_t	target_lsn = oldest_lsn
		+ lsn_avg_rate * buf_flush_lsn_scan_factor;
	ulint	pages_for_lsn = 0;

	mysql_mutex_lock(&buf_pool.flush_list_mutex);

	for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
	     b != NULL;
	     b = UT_LIST_GET_PREV(list, b)) {
		if (b->oldest_modification() > target_lsn) {
			break;
		}
		if (++pages_for_lsn >= srv_max_io_capacity) {
			break;
		}
	}
	mysql_mutex_unlock(&buf_pool.flush_list_mutex);

	pages_for_lsn /= buf_flush_lsn_scan_factor;
	if (pages_for_lsn < 1) {
		pages_for_lsn = 1;
	}

	/* Average of the capacity-scaled pressure, the historical page
	rate, and the LSN-progress estimate, capped at max capacity. */
	n_pages = (ulint(double(srv_io_capacity) * total_ratio)
		   + avg_page_rate + pages_for_lsn) / 3;

	if (n_pages > srv_max_io_capacity) {
		n_pages = srv_max_io_capacity;
	}

	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);

	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);

	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);

	goto func_exit;
}

#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
/* Avoid GCC 4.8.5 internal compiler error "could not split
insn".
+We would only need this for buf_flush_page_cleaner(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+/** page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one coordinator. */
+static void buf_flush_page_cleaner()
+{
+ my_thread_init();
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_page_cleaner_is_active);
+
+ ulint last_pages= 0;
+ timespec abstime;
+ set_timespec(abstime, 1);
+
+ lsn_t lsn_limit;
+ ulint last_activity_count= srv_get_activity_count();
+
+ /* NOTE(review): main service loop. Each iteration either waits on
+ buf_pool.do_flush_list, performs one flushing pass (LRU, background,
+ or adaptive), or services a synchronous request signalled through
+ buf_flush_sync_lsn / buf_flush_async_lsn. */
+ for (;;)
+ {
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync))
+ {
+ /* A synchronous flush request is pending: flush toward the
+ requested LSN before anything else. */
+ furious_flush:
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.ran_out())
+ goto no_wait;
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ if (buf_pool.page_cleaner_idle() &&
+ (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+ srv_max_dirty_pages_pct_lwm == 0.0))
+ /* We are idle; wait for buf_pool.page_cleaner_wakeup() */
+ my_cond_wait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ else
+ my_cond_timedwait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex, &abstime);
+ no_wait:
+ set_timespec(abstime, 1);
+
+ lsn_limit= buf_flush_sync_lsn;
+ lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+ /* No dirty pages at all: go (almost) idle, flush buffered
+ doublewrites and possibly take a checkpoint. */
+ if (!oldest_lsn)
+ {
+ fully_unemployed:
+ buf_flush_sync_lsn= 0;
+ set_idle:
+ buf_pool.page_cleaner_set_idle(true);
+ set_almost_idle:
+ pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ buf_dblwr.flush_buffered_writes();
+
+ /* do { } while (false) so that the DBUG_EXECUTE_IF
+ "continue" statements can skip the checkpoint. */
+ do
+ {
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
+
+ if (!recv_recovery_is_on() &&
+ !srv_startup_is_before_trx_rollback_phase &&
+ srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
+ log_checkpoint();
+ }
+ while (false);
+
+ if (!buf_pool.ran_out())
+ continue;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ oldest_lsn= buf_pool.get_oldest_modification(0);
+ }
+
+ lsn_t soft_lsn_limit= buf_flush_async_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ if (srv_flush_sync)
+ goto do_furious_flush;
+ if (oldest_lsn >= lsn_limit)
+ {
+ buf_flush_sync_lsn= 0;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+ else if (lsn_limit > soft_lsn_limit)
+ soft_lsn_limit= lsn_limit;
+ }
+
+ double pct_lwm= 0.0;
+ ulint n_flushed= 0, n;
+
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
+ {
+ if (oldest_lsn >= soft_lsn_limit)
+ buf_flush_async_lsn= soft_lsn_limit= 0;
+ }
+ else if (buf_pool.ran_out())
+ {
+ buf_pool.page_cleaner_set_idle(false);
+ buf_pool.n_flush_inc();
+ /* Remove clean blocks from buf_pool.flush_list before the LRU scan. */
+ for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; )
+ {
+ const lsn_t lsn{p->oldest_modification()};
+ ut_ad(lsn > 2 || lsn == 1);
+ buf_page_t *n= UT_LIST_GET_NEXT(list, p);
+ if (lsn <= 1)
+ buf_pool.delete_from_flush_list(p);
+ p= n;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ n= srv_max_io_capacity;
+ mysql_mutex_lock(&buf_pool.mutex);
+ LRU_flush:
+ n= buf_flush_LRU(n, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ last_pages+= n;
+ check_oldest_and_set_idle:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.n_flush_dec_holding_mutex();
+ oldest_lsn= buf_pool.get_oldest_modification(0);
+ if (!oldest_lsn)
+ goto fully_unemployed;
+ if (oldest_lsn >= buf_flush_async_lsn)
+ buf_flush_async_lsn= 0;
+ buf_pool.page_cleaner_set_idle(false);
+ goto set_almost_idle;
+ }
+ else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
+
+ const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+ /* We perform dirty reads of the LRU+free list lengths here.
+ Division by zero is not possible, because buf_pool.flush_list is
+ guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+ const double dirty_pct= double(dirty_blocks) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ pct_lwm= srv_max_dirty_pages_pct_lwm;
+ if (pct_lwm != 0.0)
+ {
+ const ulint activity_count= srv_get_activity_count();
+ if (activity_count != last_activity_count)
+ {
+ last_activity_count= activity_count;
+ goto maybe_unemployed;
+ }
+ else if (buf_pool.page_cleaner_idle() && !os_aio_pending_reads())
+ {
+ /* reaching here means 3 things:
+ - last_activity_count == activity_count: suggesting server is idle
+ (no trx_t::commit() activity)
+ - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ - there are no pending reads but there are dirty pages to flush */
+ buf_pool.update_last_activity_count(activity_count);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ goto idle_flush;
+ }
+ else
+ {
+ maybe_unemployed:
+ const bool below{dirty_pct < pct_lwm};
+ pct_lwm= 0.0;
+ if (below)
+ goto possibly_unemployed;
+ }
+ }
+ else if (dirty_pct < srv_max_buf_pool_modified_pct)
+ possibly_unemployed:
+ if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn))
+ goto set_idle;
+
+ buf_pool.page_cleaner_set_idle(false);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
+ {
+ n= srv_max_io_capacity;
+ goto background_flush;
+ }
+
+ if (!srv_adaptive_flushing)
+ {
+ idle_flush:
+ n= srv_io_capacity;
+ soft_lsn_limit= LSN_MAX;
+ background_flush:
+ mysql_mutex_lock(&buf_pool.mutex);
+ n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit);
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+ }
+ else if ((n= page_cleaner_flush_pages_recommendation(last_pages,
+ oldest_lsn,
+ pct_lwm,
+ dirty_blocks,
+ dirty_pct)) != 0)
+ {
+ /* Adaptive flushing: flush the recommended number of pages and
+ record the elapsed time for the next recommendation. */
+ const ulint tm= ut_time_ms();
+ mysql_mutex_lock(&buf_pool.mutex);
+ last_pages= n_flushed= buf_flush_list_holding_mutex(n);
+ page_cleaner.flush_time+= ut_time_ms() - tm;
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_flushed);
+ }
+ else if (buf_flush_async_lsn <= oldest_lsn)
+ goto check_oldest_and_set_idle;
+
+ /* Spend any remaining quota on an LRU flush. */
+ n= n >= n_flushed ? n - n_flushed : 0;
+ goto LRU_flush;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Shutdown path: drain pending writes unless this is a fast
+ (srv_fast_shutdown == 2) shutdown. */
+ if (srv_fast_shutdown != 2)
+ {
+ buf_dblwr.flush_buffered_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait_LRU_batch_end();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ os_aio_wait_until_no_pending_writes(false);
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_limit= buf_flush_sync_lsn;
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ do_furious_flush:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ goto furious_flush;
+ }
+ buf_page_cleaner_is_active= false;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ my_thread_end();
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+}
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
+{
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+ buf_flush_async_lsn= 0;
+ buf_flush_sync_lsn= 0;
+ buf_page_cleaner_is_active= true;
+ /* Detached: the thread signals its own exit by clearing
+ buf_page_cleaner_is_active (see buf_flush_page_cleaner()). */
+ std::thread(buf_flush_page_cleaner).detach();
+}
+
+/** Flush the buffer pool on shutdown. 
*/ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!os_aio_pending_reads()); + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + while (buf_pool.get_oldest_modification(0)) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_list(srv_max_io_capacity); + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + UT_LIST_GET_LEN(buf_pool.flush_list)); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!os_aio_pending_reads()); +} + +/** Synchronously flush dirty blocks during recv_sys_t::apply(). +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync_batch(lsn_t lsn) +{ + lsn= std::max(lsn, log_sys.get_lsn()); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} + +/** Synchronously flush dirty blocks. +NOTE: The calling thread is not allowed to hold any buffer page latches! 
*/ +void buf_flush_sync() +{ + if (recv_recovery_is_on()) + { + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.apply(true); + mysql_mutex_unlock(&recv_sys.mutex); + } + + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + for (;;) + { + const lsn_t lsn= log_sys.get_lsn(); + buf_flush_wait(lsn); + /* Wait for the page cleaner to be idle (for log resizing at startup) */ + while (buf_flush_sync_lsn) + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + if (lsn == log_sys.get_lsn()) + break; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); +} + +#ifdef UNIV_DEBUG +/** Functor to validate the flush list. */ +struct Check { + void operator()(const buf_page_t* elem) const + { + ut_ad(elem->oldest_modification()); + ut_ad(!fsp_is_system_temporary(elem->id().space())); + } +}; + +/** Validate the flush list. */ +static void buf_flush_validate_low() +{ + buf_page_t* bpage; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + ut_list_validate(buf_pool.flush_list, Check()); + + bpage = UT_LIST_GET_FIRST(buf_pool.flush_list); + + while (bpage != NULL) { + const lsn_t om = bpage->oldest_modification(); + /* A page in buf_pool.flush_list can be in + BUF_BLOCK_REMOVE_HASH state. This happens when a page + is in the middle of being relocated. In that case the + original descriptor can have this state and still be + in the flush list waiting to acquire the + buf_pool.flush_list_mutex to complete the relocation. */ + ut_d(const auto s= bpage->state()); + ut_ad(s >= buf_page_t::REMOVE_HASH); + ut_ad(om == 1 || om > 2); + + bpage = UT_LIST_GET_NEXT(list, bpage); + ut_ad(om == 1 || !bpage || recv_recovery_is_on() + || om >= bpage->oldest_modification()); + } +} + +/** Validate the flush list. 
*/ +void buf_flush_validate() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_validate_low(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc new file mode 100644 index 00000000..65ee8fa3 --- /dev/null +++ b/storage/innobase/buf/buf0lru.cc @@ -0,0 +1,1452 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0lru.cc +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "os0file.h" +#include "page0zip.h" +#include "log0recv.h" +#include "srv0srv.h" +#include "srv0mon.h" +#include "my_cpu.h" + +/** Flush this many pages in buf_LRU_get_free_block() */ +size_t innodb_lru_flush_size; + +/** The number of blocks from the LRU_old pointer onward, including +the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +of the whole LRU list length, except that the tolerance defined below +is allowed. Note that the tolerance must be small enough such that for +even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not +allowed to point to either end of the LRU list. */ + +static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20; + +/** The minimum amount of non-old blocks when the LRU_old list exists +(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). +@see buf_LRU_old_adjust_len */ +#define BUF_LRU_NON_OLD_MIN_LEN 5 + +/** If we switch on the InnoDB monitor because there are too few available +frames in the buffer pool, we set this to TRUE */ +static bool buf_lru_switched_on_innodb_mon = false; + +/** True if diagnostic message about difficult to find free blocks +in the buffer bool has already printed. 
*/ +static bool buf_lru_free_blocks_error_printed; + +/******************************************************************//** +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics, +buf_LRU_evict_from_unzip_LRU() decides if we want to evict from +unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the +uncompressed frame (meaning we can evict dirty blocks as well). From +the regular LRU, we will evict the entire block (i.e.: both the +uncompressed and compressed data), which must be clean. */ + +/* @{ */ + +/** Number of intervals for which we keep the history of these stats. +Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */ +static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4; + +/** Co-efficient with which we multiply I/O operations to equate them +with page_zip_decompress() operations. */ +static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50; + +/** Sampled values buf_LRU_stat_cur. +Not protected by any mutex. Updated by buf_LRU_stat_update(). */ +static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; + +/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ +static ulint buf_LRU_stat_arr_ind; + +/** Current operation counters. Not protected by any mutex. Cleared +by buf_LRU_stat_update(). */ +buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Not Protected by any mutex. */ +buf_LRU_stat_t buf_LRU_stat_sum; + +/* @} */ + +/** @name Heuristics for detecting index scan @{ */ +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +uint buf_LRU_old_threshold_ms; +/* @} */ + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If !bpage->frame && bpage->oldest_modification() <= 1, +the object will be freed. 
+
+@param bpage buffer block
+@param id page identifier
+@param chain locked buf_pool.page_hash chain (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if bpage with bpage->frame was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if block without bpage->frame was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+ buf_pool_t::hash_chain &chain,
+ bool zip);
+
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+ /* First detach the block from its file page, then return it
+ to the free list. */
+ block->page.free_file_page();
+ buf_LRU_block_free_non_file_page(block);
+}
+
+/** Increase LRU size in bytes by the page size.
+@param[in] bpage control block */
+static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ buf_pool.stat.LRU_bytes += bpage->physical_size();
+
+ ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size);
+}
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ /* If the unzip_LRU list is empty, we can only use the LRU. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) {
+ return false;
+ }
+
+ /* If unzip_LRU is at most 10% of the size of the LRU list,
+ then use the LRU. This slack allows us to keep hot
+ decompressed pages in the buffer pool. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+ return false;
+ }
+
+ /* If eviction hasn't started yet, we assume by default
+ that a workload is disk bound. */
+ if (buf_pool.freed_page_clock == 0) {
+ return true;
+ }
+
+ /* Calculate the average over past intervals, and add the values
+ of the current interval. */
+ ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.io;
+
+ ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.unzip;
+
+ /* Decide based on our formula. If the load is I/O bound
+ (unzip_avg is smaller than the weighted io_avg), evict an
+ uncompressed frame from unzip_LRU. Otherwise we assume that
+ the load is CPU bound and evict from the regular LRU. */
+ return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
+
+/** Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean.
+@param limit maximum number of blocks to scan
+@return true if freed */
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
+{
+ if (!buf_LRU_evict_from_unzip_LRU()) {
+ return(false);
+ }
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ /* Scan the unzip_LRU from its tail (coldest end); record the
+ predecessor first because freeing invalidates the current node. */
+ for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ block && scanned < limit; ++scanned) {
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ freed = buf_LRU_free_page(&block->page, false);
+ if (freed) {
+ scanned++;
+ break;
+ }
+
+ block = prev_block;
+ }
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(freed);
+}
+
+/** Try to free a clean page from the common LRU list. 
+@param limit maximum number of blocks to scan
+@return whether a page was freed */
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ /* Resume the scan from buf_pool.lru_scan_itr (a hazard-pointer
+ iterator), stepping toward the head of the LRU list. */
+ for (buf_page_t* bpage = buf_pool.lru_scan_itr.start();
+ bpage && scanned < limit;
+ ++scanned, bpage = buf_pool.lru_scan_itr.get()) {
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool.lru_scan_itr.set(prev);
+
+ const auto accessed = bpage->is_accessed();
+
+ if (buf_LRU_free_page(bpage, true)) {
+ if (!accessed) {
+ /* Keep track of pages that are evicted without
+ ever being accessed. This gives us a measure of
+ the effectiveness of readahead */
+ ++buf_pool.stat.n_ra_pages_evicted;
+ }
+
+ freed = true;
+ scanned++;
+ break;
+ }
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+
+ return(freed);
+}
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+
+ /* Loop: a candidate may have to be withdrawn (buffer pool
+ shrinking), in which case the next free block is tried. */
+ while (block != NULL) {
+ ut_ad(block->page.in_free_list);
+ ut_d(block->page.in_free_list = FALSE);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+ UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+ if (!buf_pool.is_shrinking()
+ || UT_LIST_GET_LEN(buf_pool.withdraw)
+ >= buf_pool.withdraw_target
+ || !buf_pool.will_be_withdrawn(block->page)) {
+ /* No adaptive hash index entries may point to
+ a free block. */
+ assert_block_ahi_empty(block);
+
+ block->page.set_state(buf_page_t::MEMORY);
+ block->page.set_os_used();
+ break;
+ }
+
+ /* This should be withdrawn */
+ UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page);
+ ut_d(block->in_withdraw_list = true);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+ }
+
+ return(block);
+}
+
+/******************************************************************//**
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks)
+ return;
+
+ /* s = number of blocks that are either free or holding data;
+ the remainder of the pool is occupied by non-data objects. */
+ const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (s < buf_pool.curr_size / 20)
+ ib::fatal() << "Over 95 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many"
+ " row locks, or review if innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M could be bigger.";
+
+ if (s < buf_pool.curr_size / 3)
+ {
+ if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+ {
+ /* Over 67 % of the buffer pool is occupied by lock heaps or
+ the adaptive hash index. This may be a memory leak! */
+ ib::warn() << "Over 67 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many row locks."
+ " innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M. Starting the InnoDB Monitor to print diagnostics.";
+ buf_lru_switched_on_innodb_mon= true;
+ srv_print_innodb_monitor= TRUE;
+ srv_monitor_timer_schedule_now();
+ }
+ }
+ else if (buf_lru_switched_on_innodb_mon)
+ {
+ /* Switch off the InnoDB Monitor; this is a simple way to stop the
+ monitor if the situation becomes less urgent, but may also
+ surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+ buf_lru_switched_on_innodb_mon= false;
+ srv_print_innodb_monitor= FALSE;
+ }
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+ * get a block from the buf_pool.free list, success:done
+ * if buf_pool.try_LRU_scan is set
+ * scan LRU up to 100 pages to free a clean block
+ * success:retry the free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+ (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth)
+ * on buf_page_write_complete() the blocks will put on buf_pool.free list
+ * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+ ulint n_iterations = 0;
+ ulint flush_failures = 0;
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+ if (have_mutex) {
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ goto got_mutex;
+ }
+ DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
+ if (recv_recovery_is_on()
+ && recv_sys.apply_log_recs) {
+ mysql_mutex_lock(&buf_pool.mutex);
+ goto flush_lru;
+ });
+get_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+ buf_LRU_check_size_of_non_data_objects();
+ buf_block_t* block;
+
+ DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+ if (!buf_lru_free_blocks_error_printed) {
+ n_iterations = 21;
+ goto not_found;});
+
+retry:
+ /* If there is a block in the free list, take it */
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+ if (!have_mutex) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ block->page.zip.clear();
+ return block;
+ }
+
+ MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
+ if (n_iterations || buf_pool.try_LRU_scan) {
+ /* If no block was in the free list, search from the
+ end of the LRU list and try to free a block there.
+ If we are doing for the first time we'll scan only
+ tail of the LRU list otherwise we scan the whole LRU
+ list. */
+ if (buf_LRU_scan_and_free_block(n_iterations
+ ? ULINT_UNDEFINED : 100)) {
+ goto retry;
+ }
+
+ /* Tell other threads that there is no point
+ in scanning the LRU list. */
+ buf_pool.try_LRU_scan = false;
+ }
+
+ /* Wait for a concurrent flush batch to release blocks; wake the
+ page cleaner if the LRU scan gave up. */
+ for (;;) {
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+ goto got_block;
+ }
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto n_flush = buf_pool.n_flush();
+ if (!buf_pool.try_LRU_scan) {
+ buf_pool.page_cleaner_wakeup(true);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (!n_flush) {
+ goto not_found;
+ }
+ if (!buf_pool.try_LRU_scan) {
+ my_cond_wait(&buf_pool.done_free,
+ &buf_pool.mutex.m_mutex);
+ }
+ }
+
+not_found:
+ if (n_iterations > 1) {
+ MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+ }
+
+ if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
+ && srv_buf_pool_old_size == srv_buf_pool_size) {
+ buf_lru_free_blocks_error_printed = true;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ ib::warn() << "Difficult to find free blocks in the buffer pool"
+ " (" << n_iterations << " search iterations)! "
+ << flush_failures << " failed attempts to"
+ " flush a page!"
+ " Consider increasing innodb_buffer_pool_size."
+ " Pending flushes (fsync): "
+ << fil_n_pending_tablespace_flushes
+ << ". " << os_n_file_reads << " OS file reads, "
+ << os_n_file_writes << " OS file writes, "
+ << os_n_fsyncs
+ << " OS fsyncs.";
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+
+ /* No free block was found: try to flush the LRU list.
+ The freed blocks will be up for grabs for all threads.
+
+ TODO: A more elegant way would have been to return one freed
+ up block to the caller here but the code that deals with
+ removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
+ involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
+ can do that in a separate patch sometime in future. */
+#ifndef DBUG_OFF
+flush_lru:
+#endif
+ if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
+ MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+ ++flush_failures;
+ }
+
+ n_iterations++;
+ buf_pool.stat.LRU_waits++;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ goto get_mutex;
+}
+
+/** Move the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits. */
+static void buf_LRU_old_adjust_len()
+{
+ ulint old_len;
+ ulint new_len;
+
+ ut_a(buf_pool.LRU_old);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+ ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+ compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
+ > BUF_LRU_OLD_RATIO_DIV
+ * (BUF_LRU_OLD_TOLERANCE + 5));
+ compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
+
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Target length: LRU_old_ratio of the list, but leaving at least
+ BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN non-old blocks. */
+ old_len = buf_pool.LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+ * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool.LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ for (;;) {
+ buf_page_t* LRU_old = buf_pool.LRU_old;
+
+ ut_a(LRU_old);
+ ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Update the LRU_old pointer if necessary */
+
+ if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
+
+ buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV(
+ LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+ old_len = ++buf_pool.LRU_old_len;
+ LRU_old->set_old(true);
+
+ } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
+
+ buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+ old_len = --buf_pool.LRU_old_len;
+ LRU_old->set_old(false);
+ } else {
+ return;
+ }
+ }
+}
+
+/** Initialize the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
+static void buf_LRU_old_init()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
+
+ /* We first initialize all blocks in the LRU list as old and then use
+ the adjust function to move the LRU_old pointer to the right
+ position */
+
+ for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ut_ad(bpage->in_LRU_list);
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = true;
+ }
+
+ buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU);
+ buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ buf_LRU_old_adjust_len();
+}
+
+/** Remove a block from the unzip_LRU list if it belonged to the list.
+@param[in] bpage control block */
+static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
+{
+ ut_ad(bpage->in_file());
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = false);
+
+ UT_LIST_REMOVE(buf_pool.unzip_LRU, block);
+ }
+}
+
+/** Removes a block from the LRU list.
+@param[in] bpage control block */
+static inline void buf_LRU_remove_block(buf_page_t* bpage)
+{
+ /* Important that we adjust the hazard pointers before removing
+ bpage from the LRU list. */
+ buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage);
+
+ /* If the LRU_old pointer is defined and points to just this block,
+ move it backward one step */
+
+ if (bpage == buf_pool.LRU_old) {
+
+ /* Below: the previous block is guaranteed to exist,
+ because the LRU_old pointer is only allowed to differ
+ by BUF_LRU_OLD_TOLERANCE from strict
+ buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+ list length. */
+ ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+ buf_pool.LRU_old = prev_bpage;
+ prev_bpage->set_old(true);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ buf_pool.stat.LRU_bytes -= bpage->physical_size();
+
+ buf_unzip_LRU_remove_block_if_needed(bpage);
+
+ /* If the LRU list is so short that LRU_old is not defined,
+ clear the "old" flags and return */
+ if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = false;
+ }
+
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+
+ return;
+ }
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Update the LRU_old_len field if necessary */
+ if (bpage->old) {
+ buf_pool.LRU_old_len--;
+ }
+
+ /* Adjust the length of the old block list if necessary */
+ buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(block->page.belongs_to_unzip_LRU());
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = true);
+
+ if (old) {
+ UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the page_size is
+already set when invoking the function, so that we can get correct
+page_size from the buffer page when adding a block into LRU */
+void
+buf_LRU_add_block(
+ buf_page_t* bpage, /*!< in: control block */
+ bool old) /*!< in: true if should be put to the old blocks
+ in the LRU list, else put to the start; if the
+ LRU list is very short, the block is added to
+ the start, regardless of this parameter */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!bpage->in_LRU_list);
+
+ if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+ UT_LIST_ADD_FIRST(buf_pool.LRU, bpage);
+
+ /* Record the freed_page_clock, truncated to 31 bits. */
+ bpage->freed_page_clock = buf_pool.freed_page_clock
+ & ((1U << 31) - 1);
+ } else {
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old,
+ bpage);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ ut_d(bpage->in_LRU_list = TRUE);
+
+ incr_LRU_size_in_bytes(bpage);
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ bpage->set_old(old);
+ buf_LRU_old_adjust_len();
+
+ } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init();
+ } else {
+ bpage->set_old(buf_pool.LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+ }
+}
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage)
+{
+ /* Skip pages that are still being read in. */
+ if (bpage->is_read_fixed())
+ return;
+
+ ut_ad(bpage->in_file());
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (UNIV_UNLIKELY(bpage->old))
+ buf_pool.stat.n_pages_made_young++;
+
+ /* Re-insert at the head of the LRU list. */
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block(bpage, false);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex. 
+@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +{ + const page_id_t id{bpage->id()}; + buf_page_t* b = nullptr; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* First, perform a quick check before we acquire hash_lock. */ + if (!bpage->can_relocate()) { + return false; + } + + /* We must hold an exclusive hash_lock to prevent + bpage->can_relocate() from changing due to a concurrent + execution of buf_page_get_low(). */ + buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); + /* We cannot use transactional_lock_guard here, + because buf_buddy_relocate() in buf_buddy_free() could get stuck. */ + hash_lock.lock(); + const lsn_t oldest_modification = bpage->oldest_modification_acquire(); + + if (UNIV_UNLIKELY(!bpage->can_relocate())) { + /* Do not free buffer fixed and I/O-fixed blocks. 
*/ + goto func_exit; + } + + switch (oldest_modification) { + case 2: + ut_ad(id.space() == SRV_TMP_SPACE_ID); + ut_ad(!bpage->zip.data); + if (!bpage->is_freed()) { + goto func_exit; + } + bpage->clear_oldest_modification(); + break; + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (const lsn_t om = bpage->oldest_modification()) { + ut_ad(om == 1); + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!bpage->oldest_modification()); + /* fall through */ + case 0: + if (zip || !bpage->zip.data || !bpage->frame) { + break; + } +relocate_compressed: + b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b)); + ut_a(b); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + new (b) buf_page_t(*bpage); + b->frame = nullptr; + { + ut_d(uint32_t s=) b->fix(); + ut_ad(s == buf_page_t::FREED + || s == buf_page_t::UNFIXED + || s == buf_page_t::IBUF_EXIST + || s == buf_page_t::REINIT); + } + break; + default: + if (zip || !bpage->zip.data || !bpage->frame) { + /* This would completely free the block. */ + /* Do not completely free dirty blocks. */ +func_exit: + hash_lock.unlock(); + return(false); + } + goto relocate_compressed; + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + + DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no())); + + ut_ad(bpage->can_relocate()); + + if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) { + ut_ad(!b); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + return(true); + } + + /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr + then it was a compressed page with an uncompressed frame and + we are interested in freeing only the uncompressed frame. + Therefore we have to reinsert the compressed page descriptor + into the LRU and page_hash (and possibly flush_list). 
+ if !b then it was a regular page that has been freed */ + + if (UNIV_LIKELY_NULL(b)) { + buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); + + ut_ad(!buf_pool.page_hash.get(id, chain)); + ut_ad(b->zip_size()); + + /* The field in_LRU_list of + the to-be-freed block descriptor should have + been cleared in + buf_LRU_block_remove_hashed(), which + invokes buf_LRU_remove_block(). */ + ut_ad(!bpage->in_LRU_list); + ut_ad(bpage->frame); + ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); + + /* The fields of bpage were copied to b before + buf_LRU_block_remove_hashed() was invoked. */ + ut_ad(!b->in_zip_hash); + ut_ad(b->in_LRU_list); + ut_ad(b->in_page_hash); + ut_d(b->in_page_hash = false); + b->hash = nullptr; + + buf_pool.page_hash.append(chain, b); + + /* Insert b where bpage was in the LRU list. */ + if (prev_b) { + ulint lru_len; + + ut_ad(prev_b->in_LRU_list); + ut_ad(prev_b->in_file()); + + UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b); + + incr_LRU_size_in_bytes(b); + + if (b->is_old()) { + buf_pool.LRU_old_len++; + if (buf_pool.LRU_old + == UT_LIST_GET_NEXT(LRU, b)) { + + buf_pool.LRU_old = b; + } + } + + lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + if (lru_len > BUF_LRU_OLD_MIN_LEN) { + ut_ad(buf_pool.LRU_old); + /* Adjust the length of the + old block list if necessary */ + buf_LRU_old_adjust_len(); + } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is now long + enough for LRU_old to become + defined: init it */ + buf_LRU_old_init(); + } +#ifdef UNIV_LRU_DEBUG + /* Check that the "old" flag is consistent + in the block and its neighbours. 
*/ + b->set_old(b->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } else { + ut_d(b->in_LRU_list = FALSE); + buf_LRU_add_block(b, b->old); + } + + buf_flush_relocate_on_flush_list(bpage, b); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + bpage->zip.data = nullptr; + + page_zip_set_size(&bpage->zip, 0); + + b->lock.x_lock(); + hash_lock.unlock(); + } else if (!zip) { + hash_lock.unlock(); + } + + buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); + +#ifdef BTR_CUR_HASH_ADAPT + if (block->index) { + mysql_mutex_unlock(&buf_pool.mutex); + + /* Remove the adaptive hash index on the page. + The page was declared uninitialized by + buf_LRU_block_remove_hashed(). We need to flag + the contents of the page valid (which it still is) in + order to avoid bogus Valgrind or MSAN warnings.*/ + + MEM_MAKE_DEFINED(block->page.frame, srv_page_size); + btr_search_drop_page_hash_index(block, false); + MEM_UNDEFINED(block->page.frame, srv_page_size); + mysql_mutex_lock(&buf_pool.mutex); + } +#endif + if (UNIV_LIKELY_NULL(b)) { + ut_ad(b->zip_size()); + b->lock.x_unlock(); + b->unfix(); + } + + buf_LRU_block_free_hashed_page(block); + + return(true); +} + +/******************************************************************//** +Puts a block back to the free list. 
*/ +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block) /*!< in: block, must not contain a file page */ +{ + void* data; + + ut_ad(block->page.state() == buf_page_t::MEMORY); + assert_block_ahi_empty(block); + ut_ad(!block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_ad(!block->page.hash); + + block->page.set_state(buf_page_t::NOT_USED); + + MEM_UNDEFINED(block->page.frame, srv_page_size); + data = block->page.zip.data; + + if (data != NULL) { + block->page.zip.data = NULL; + buf_pool_mutex_exit_forbid(); + + ut_ad(block->zip_size()); + + buf_buddy_free(data, block->zip_size()); + + buf_pool_mutex_exit_allow(); + page_zip_set_size(&block->page.zip, 0); + } + + if (buf_pool.is_shrinking() + && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target + && buf_pool.will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_ADD_LAST( + buf_pool.withdraw, + &block->page); + ut_d(block->in_withdraw_list = true); + } else { + UT_LIST_ADD_FIRST(buf_pool.free, &block->page); + ut_d(block->page.in_free_list = true); + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); + } + + block->page.set_os_unused(); +} + +/** Release a memory block to the buffer pool. */ +ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) +{ + ut_ad(this == &buf_pool); + mysql_mutex_lock(&mutex); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&mutex); +} + + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If !bpage->frame && !bpage->oldest_modification(), the object will be freed. + +@param bpage buffer block +@param id page identifier +@param chain locked buf_pool.page_hash chain (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. 
The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + buf_pool_t::hash_chain &chain, + bool zip) +{ + ut_a(bpage->can_relocate()); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + + buf_LRU_remove_block(bpage); + + buf_pool.freed_page_clock += 1; + + if (UNIV_LIKELY(!bpage->zip.data)) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + } else if (const page_t *page = bpage->frame) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + + ut_a(!zip || !bpage->oldest_modification()); + ut_ad(bpage->zip_size()); + /* Skip consistency checks if the page was freed. + In recovery, we could get a sole FREE_PAGE record + and nothing else, for a ROW_FORMAT=COMPRESSED page. + Its contents would be garbage. */ + if (!bpage->is_freed()) + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. 
*/ + memcpy(bpage->zip.data, page, + bpage->zip_size()); + } + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + break; + default: + ib::error() << "The compressed page to be" + " evicted seems corrupt:"; + ut_print_buf(stderr, page, srv_page_size); + + ib::error() << "Possibly older version of" + " the page:"; + + ut_print_buf(stderr, bpage->zip.data, + bpage->zip_size()); + putc('\n', stderr); + ut_error; + } + } else { + ut_a(!bpage->oldest_modification()); + MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); + } + + ut_ad(!bpage->in_zip_hash); + buf_pool.page_hash.remove(chain, bpage); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); + + if (UNIV_UNLIKELY(!bpage->frame)) { + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->in_LRU_list); + ut_a(bpage->zip.data); + ut_a(bpage->zip.ssize); + ut_ad(!bpage->oldest_modification()); + + hash_lock.unlock(); + buf_pool_mutex_exit_forbid(); + + buf_buddy_free(bpage->zip.data, bpage->zip_size()); + + buf_pool_mutex_exit_allow(); + bpage->lock.free(); + ut_free(bpage); + return false; + } else { + static_assert(FIL_NULL == 0xffffffffU, "fill pattern"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(bpage->frame + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(bpage->frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(bpage->frame, srv_page_size); + bpage->set_state(buf_page_t::REMOVE_HASH); + + if (!zip) { + return true; + } + + hash_lock.unlock(); + + if (bpage->zip.data) { + /* Free the compressed page. 
*/ + void* data = bpage->zip.data; + bpage->zip.data = NULL; + + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->oldest_modification()); + ut_ad(!bpage->in_LRU_list); + buf_pool_mutex_exit_forbid(); + + buf_buddy_free(data, bpage->zip_size()); + + buf_pool_mutex_exit_allow(); + + page_zip_set_size(&bpage->zip, 0); + } + + return true; + } +} + +/** Release and evict a corrupted page. +@param bpage x-latched page that was found corrupted +@param state expected current state of the page */ +ATTRIBUTE_COLD +void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) +{ + const page_id_t id{bpage->id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + + recv_sys.free_corrupted_page(id); + mysql_mutex_lock(&mutex); + hash_lock.lock(); + + ut_ad(!bpage->oldest_modification()); + bpage->set_corrupt_id(); + auto unfix= state - buf_page_t::FREED; + auto s= bpage->zip.fix.fetch_sub(unfix) - unfix; + bpage->lock.x_unlock(true); + + while (s != buf_page_t::FREED || bpage->lock.is_locked_or_waiting()) + { + ut_ad(s >= buf_page_t::FREED); + ut_ad(s < buf_page_t::UNFIXED); + /* Wait for other threads to release the fix count + before releasing the bpage from LRU list. */ + (void) LF_BACKOFF(); + s= bpage->state(); + } + + /* remove from LRU and page_hash */ + if (buf_LRU_block_remove_hashed(bpage, id, chain, true)) + buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage)); + + mysql_mutex_unlock(&mutex); +} + +/** Update buf_pool.LRU_old_ratio. 
+@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust) +{ + uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + mysql_mutex_lock(&buf_pool.mutex); + + if (ratio != buf_pool.LRU_old_ratio) { + buf_pool.LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool.LRU) + >= BUF_LRU_OLD_MIN_LEN) { + buf_LRU_old_adjust_len(); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } else { + buf_pool.LRU_old_ratio = ratio; + } + /* the reverse of + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ + return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); +} + +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update() +{ + buf_LRU_stat_t* item; + buf_LRU_stat_t cur_stat; + + if (!buf_pool.freed_page_clock) { + goto func_exit; + } + + /* Update the index. */ + item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; + buf_LRU_stat_arr_ind++; + buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; + + /* Add the current value and subtract the obsolete entry. + Since buf_LRU_stat_cur is not protected by any mutex, + it can be changing between adding to buf_LRU_stat_sum + and copying to item. Assign it to local variables to make + sure the same value assign to the buf_LRU_stat_sum + and item */ + cur_stat = buf_LRU_stat_cur; + + buf_LRU_stat_sum.io += cur_stat.io - item->io; + buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; + + /* Put current entry in the array. 
*/ + memcpy(item, &cur_stat, sizeof *item); + +func_exit: + /* Clear the current entry. */ + memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); +} + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". +We would only need this for buf_LRU_scan_and_free_block(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif +/** Try to free a replaceable block. +@param limit maximum number of blocks to scan +@return true if found and freed */ +bool buf_LRU_scan_and_free_block(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + return buf_LRU_free_from_unzip_LRU_list(limit) || + buf_LRU_free_from_common_LRU_list(limit); +} + +#ifdef UNIV_DEBUG +/** Validate the LRU list. */ +void buf_LRU_validate() +{ + ulint old_len; + ulint new_len; + + mysql_mutex_lock(&buf_pool.mutex); + + if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool.LRU_old); + old_len = buf_pool.LRU_old_len; + + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + CheckInLRUList::validate(); + + old_len = 0; + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + ut_ad(bpage->in_file()); + ut_ad(!bpage->frame + || reinterpret_cast<buf_block_t*>(bpage) + ->in_unzip_LRU_list + == bpage->belongs_to_unzip_LRU()); + + if (bpage->is_old()) { + const buf_page_t* prev + = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next + = UT_LIST_GET_NEXT(LRU, bpage); + + if (!old_len++) { + ut_a(buf_pool.LRU_old == bpage); + } else { + ut_a(!prev || prev->is_old()); + } + + ut_a(!next || next->is_old()); + } + } + + ut_a(buf_pool.LRU_old_len == 
old_len); + + CheckInFreeList::validate(); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(bpage->state() == buf_page_t::NOT_USED); + } + + CheckUnzipLRUAndLRUList::validate(); + + for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU); + block != NULL; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(block->page.belongs_to_unzip_LRU()); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print() +{ + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const page_id_t id(bpage->id()); + + fprintf(stderr, "BLOCK space %u page %u ", + id.space(), id.page_no()); + + if (bpage->is_old()) { + fputs("old ", stderr); + } + + const unsigned s = bpage->state(); + if (s > buf_page_t::UNFIXED) { + fprintf(stderr, "fix %u ", s - buf_page_t::UNFIXED); + } else { + ut_ad(s == buf_page_t::UNFIXED + || s == buf_page_t::REMOVE_HASH); + } + + if (bpage->oldest_modification()) { + fputs("modif. 
", stderr); + } + + if (const byte* frame = bpage->zip.data) { + fprintf(stderr, "\ntype %u size " ULINTPF + " index id " IB_ID_FMT "\n", + fil_page_get_type(frame), + bpage->zip_size(), + btr_page_get_index_id(frame)); + } else { + fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n", + fil_page_get_type(bpage->frame), + btr_page_get_index_id(bpage->frame)); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc new file mode 100644 index 00000000..c4f07738 --- /dev/null +++ b/storage/innobase/buf/buf0rea.cc @@ -0,0 +1,710 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0rea.cc +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include <mysql/service_thd_wait.h> + +#include "buf0rea.h" +#include "fil0fil.h" +#include "mtr0mtr.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "ibuf0ibuf.h" +#include "log0recv.h" +#include "trx0sys.h" +#include "os0file.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "log.h" +#include "mariadb_stats.h" + +/** If there are buf_pool.curr_size per the number below pending reads, then +read-ahead is not done: this is to prevent flooding the buffer pool with +i/o-fixed buffer blocks */ +#define BUF_READ_AHEAD_PEND_LIMIT 2 + +/** Remove the sentinel block for the watch before replacing it with a +real block. watch_unset() or watch_occurred() will notice +that the block has been replaced with the real block. 
+@param w sentinel +@param chain locked hash table chain +@return w->state() */ +inline uint32_t buf_pool_t::watch_remove(buf_page_t *w, + buf_pool_t::hash_chain &chain) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked()); + ut_ad(w >= &watch[0]); + ut_ad(w < &watch[array_elements(watch)]); + ut_ad(!w->in_zip_hash); + ut_ad(!w->zip.data); + + uint32_t s{w->state()}; + w->set_state(buf_page_t::NOT_USED); + ut_ad(s >= buf_page_t::UNFIXED); + ut_ad(s < buf_page_t::READ_FIX); + + if (~buf_page_t::LRU_MASK & s) + page_hash.remove(chain, w); + + ut_ad(!w->in_page_hash); + w->id_= page_id_t(~0ULL); + return s; +} + +/** Initialize a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] unzip whether the uncompressed page is + requested (for ROW_FORMAT=COMPRESSED) +@return pointer to the block +@retval NULL in case of an error */ +TRANSACTIONAL_TARGET +static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, + ulint zip_size, bool unzip) +{ + mtr_t mtr; + + if (mode == BUF_READ_IBUF_PAGES_ONLY) + { + /* It is a read-ahead within an ibuf routine */ + ut_ad(!ibuf_bitmap_page(page_id, zip_size)); + ibuf_mtr_start(&mtr); + + if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr)) + { + ibuf_mtr_commit(&mtr); + return nullptr; + } + } + else + ut_ad(mode == BUF_READ_ANY_PAGE); + + buf_page_t *bpage= nullptr; + buf_block_t *block= nullptr; + if (!zip_size || unzip || recv_recovery_is_on()) + { + block= buf_LRU_get_free_block(false); + block->initialise(page_id, zip_size, buf_page_t::READ_FIX); + /* x_unlock() will be invoked + in buf_page_t::read_complete() by the io-handler thread. */ + block->page.lock.x_lock(true); + } + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + + mysql_mutex_lock(&buf_pool.mutex); + + buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain); + if (hash_page && !buf_pool.watch_is_sentinel(*hash_page)) + { + /* The page is already in the buffer pool. 
*/ + if (block) + { + block->page.lock.x_unlock(true); + ut_d(block->page.set_state(buf_page_t::MEMORY)); + buf_LRU_block_free_non_file_page(block); + } + goto func_exit; + } + + if (UNIV_LIKELY(block != nullptr)) + { + bpage= &block->page; + + /* Insert into the hash table of file pages */ + if (hash_page) + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + bpage->set_state(buf_pool.watch_remove(hash_page, chain) + + (buf_page_t::READ_FIX - buf_page_t::UNFIXED)); + buf_pool.page_hash.append(chain, &block->page); + } + else + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + buf_pool.page_hash.append(chain, &block->page); + } + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(&block->page, true/* to old blocks */); + + if (UNIV_UNLIKELY(zip_size)) + { + /* buf_pool.mutex may be released and reacquired by + buf_buddy_alloc(). We must defer this operation until after the + block descriptor has been added to buf_pool.LRU and + buf_pool.page_hash. */ + block->page.zip.data= static_cast<page_zip_t*> + (buf_buddy_alloc(zip_size)); + + /* To maintain the invariant + block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU() + we have to add this block to unzip_LRU + after block->page.zip.data is set. */ + ut_ad(block->page.belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(block, TRUE); + } + } + else + { + /* The compressed page must be allocated before the + control block (bpage), in order to avoid the + invocation of buf_buddy_relocate_block() on + uninitialized data. */ + bool lru= false; + void *data= buf_buddy_alloc(zip_size, &lru); + + /* If buf_buddy_alloc() allocated storage from the LRU list, + it released and reacquired buf_pool.mutex. Thus, we must + check the page_hash again, as it may have been modified. 
*/ + if (UNIV_UNLIKELY(lru)) + { + hash_page= buf_pool.page_hash.get(page_id, chain); + + if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page))) + { + /* The block was added by some other thread. */ + buf_buddy_free(data, zip_size); + goto func_exit; + } + } + + bpage= static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *bpage)); + + page_zip_des_init(&bpage->zip); + page_zip_set_size(&bpage->zip, zip_size); + bpage->zip.data = (page_zip_t*) data; + + bpage->init(buf_page_t::READ_FIX, page_id); + bpage->lock.x_lock(true); + + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + + if (hash_page) + bpage->set_state(buf_pool.watch_remove(hash_page, chain) + + (buf_page_t::READ_FIX - buf_page_t::UNFIXED)); + + buf_pool.page_hash.append(chain, bpage); + } + + /* The block must be put to the LRU list, to the old blocks. + The zip size is already set into the page zip */ + buf_LRU_add_block(bpage, true/* to old blocks */); + } + + buf_pool.stat.n_pages_read++; +func_exit: + mysql_mutex_unlock(&buf_pool.mutex); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) + ibuf_mtr_commit(&mtr); + + ut_ad(!bpage || bpage->in_file()); + + return bpage; +} + +/** Low-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there, in which case does nothing. +Sets the io_fix flag and sets an exclusive lock on the buffer frame. The +flag is cleared and the x-lock released by an i/o-handler thread. 
+ +@param[in,out] space tablespace +@param[in] sync true if synchronous aio is desired +@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ..., +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] unzip true=request uncompressed page +@return error code +@retval DB_SUCCESS if the page was read +@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */ +static +dberr_t +buf_read_page_low( + fil_space_t* space, + bool sync, + ulint mode, + const page_id_t page_id, + ulint zip_size, + bool unzip) +{ + buf_page_t* bpage; + + if (buf_dblwr.is_inside(page_id)) { + space->release(); + return DB_PAGE_CORRUPTED; + } + + if (sync) { + } else if (trx_sys_hdr_page(page_id) + || ibuf_bitmap_page(page_id, zip_size) + || (!recv_no_ibuf_operations + && ibuf_page(page_id, zip_size, nullptr))) { + + /* Trx sys header is so low in the latching order that we play + safe and do not leave the i/o-completion to an asynchronous + i/o-thread. Change buffer pages must always be read with + synchronous i/o, to make sure they do not get involved in + thread deadlocks. */ + sync = true; + } + + /* The following call will also check if the tablespace does not exist + or is being dropped; if we succeed in initing the page in the buffer + pool for read, then DISCARD cannot proceed until the read has + completed */ + bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip); + + if (!bpage) { + space->release(); + return DB_SUCCESS_LOCKED_REC; + } + + ut_ad(bpage->in_file()); + ulonglong mariadb_timer= 0; + + if (sync) { + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + if (mariadb_stats_active()) + mariadb_timer= mariadb_measure(); + } + + DBUG_LOG("ib_buf", + "read page " << page_id << " zip_size=" << zip_size + << " unzip=" << unzip << ',' << (sync ? "sync" : "async")); + + void* dst = zip_size ? bpage->zip.data : bpage->frame; + const ulint len = zip_size ? zip_size : srv_page_size; + + auto fio = space->io(IORequest(sync + ? 
IORequest::READ_SYNC + : IORequest::READ_ASYNC), + os_offset_t{page_id.page_no()} * len, len, + dst, bpage); + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { + buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX); + } else if (sync) { + thd_wait_end(NULL); + /* The i/o was already completed in space->io() */ + fio.err = bpage->read_complete(*fio.node); + space->release(); + if (fio.err == DB_FAIL) { + fio.err = DB_PAGE_CORRUPTED; + } + if (mariadb_timer) + mariadb_increment_pages_read_time(mariadb_timer); + } + + return fio.err; +} + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@param[in] page_id page id of a page which the current thread +wants to access +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! 
 */
+TRANSACTIONAL_TARGET
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+  if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
+    /* Disable the read-ahead for temporary tablespace */
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  if (os_aio_pending_reads_approx() >
+      buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0;
+
+  fil_space_t* space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  /* count starts out as the threshold of recently accessed pages that
+  must be found in the area before read-ahead triggers; !--count below
+  jumps to read_ahead exactly when it has been decremented to zero, so
+  the same variable can then be reused to tally issued reads. */
+  ulint count= 5 + buf_read_ahead_area / 8;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  page_id_t high= low + buf_read_ahead_area;
+  high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+  /* Count how many blocks in the area have been recently accessed,
+  that is, reside near the start of the LRU list. */
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
+      if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count)
+        goto read_ahead;
+  }
+
+no_read_ahead:
+  space->release();
+  return 0;
+
+read_ahead:
+  if (space->is_stopping())
+    goto no_read_ahead;
+
+  /* Read all the suitable blocks within the area */
+  const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    if (ibuf_bitmap_page(i, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    space->reacquire();
+    /* Only pages for which a read was actually submitted count;
+    pages already in the buffer pool yield DB_SUCCESS_LOCKED_REC. */
+    if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) ==
+        DB_SUCCESS)
+      count++;
+  }
+
+  if (count)
+  {
+    DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          low.page_no()));
+    mysql_mutex_lock(&buf_pool.mutex);
+    /* Read ahead is considered one I/O operation for the purpose of
+    LRU policy decision. */
+    buf_LRU_stat_inc_io();
+    buf_pool.stat.n_ra_pages_read_rnd+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  space->release();
+  return count;
+}
+
+/** High-level function which reads a page from a file to buf_pool
+if it is not already there. Sets the io_fix and an exclusive lock
+on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
+{
+  fil_space_t *space= fil_space_t::get(page_id.space());
+  if (!space)
+  {
+    ib::info() << "trying to read page " << page_id
+               << " in nonexisting or being-dropped tablespace";
+    return DB_TABLESPACE_DELETED;
+  }
+
+  buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */
+  return buf_read_page_low(space, true, BUF_READ_ANY_PAGE,
+                           page_id, zip_size, false);
+}
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there.
Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out]	space		tablespace
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+                              ulint zip_size)
+{
+  buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
+                    page_id, zip_size, false);
+
+  /* We do not increment number of I/O operations used for LRU policy
+  here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+  about evicting uncompressed version of compressed pages from the
+  buffer pool. Since this function is called from buffer pool load
+  these IOs are deliberate and are not part of normal workload we can
+  ignore these in our heuristics. */
+}
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether if we are inside ibuf routine
+@return number of page read requests issued */
+TRANSACTIONAL_TARGET
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+  /* check if readahead is disabled.
+  Disable the read ahead logic for temporary tablespace */
+  if (!srv_read_ahead_threshold || page_id.space() >= SRV_TMP_SPACE_ID)
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (os_aio_pending_reads_approx() >
+      buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+  /* We will check that almost all pages in the area have been accessed
+  in the desired order. */
+  /* The trigger page must be one of the two border pages of the area:
+  low for an ascending access pattern, high_1 for a descending one
+  (enforced by the check below). */
+  const bool descending= page_id != low;
+
+  if (!descending && page_id != high_1)
+    /* This is not a border page of the area */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  fil_space_t *space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  if (high_1.page_no() > space->last_page_number())
+  {
+    /* The area is not whole. */
+fail:
+    space->release();
+    return 0;
+  }
+
+  /* How many out of order accessed pages can we ignore
+  when working out the access pattern for linear readahead */
+  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+                               srv_read_ahead_threshold,
+                               uint32_t{buf_pool.read_ahead_area});
+  page_id_t new_low= low, new_high_1= high_1;
+  unsigned prev_accessed= 0;
+  for (page_id_t i= low; i <= high_1; ++i)
+  {
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+    page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+    /* It does not make sense to use transactional_lock_guard here,
+    because we would have many complex conditions inside the memory
+    transaction. */
+    hash_lock.lock_shared();
+
+    const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
+    if (!bpage)
+    {
+      hash_lock.unlock_shared();
+      if (i == page_id)
+        goto fail;
+      /* Tolerate up to count pages that do not qualify before
+      giving up on the read-ahead. */
+failed:
+      if (--count)
+        continue;
+      goto fail;
+    }
+    const unsigned accessed= bpage->is_accessed();
+    if (i == page_id)
+    {
+      /* Read the natural predecessor and successor page addresses from
+      the page; NOTE that because the calling thread may have an x-latch
+      on the page, we do not acquire an s-latch on the page, this is to
+      prevent deadlocks. The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself. */
+      const byte *f= bpage->frame ? bpage->frame : bpage->zip.data;
+      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+      hash_lock.unlock_shared();
+      if (prev == FIL_NULL || next == FIL_NULL)
+        goto fail;
+      page_id_t id= page_id;
+      if (descending)
+      {
+        if (id == high_1)
+          ++id;
+        else if (next - 1 != page_id.page_no())
+          goto fail;
+        else
+          id.set_page_no(prev);
+      }
+      else
+      {
+        if (prev + 1 != page_id.page_no())
+          goto fail;
+        id.set_page_no(next);
+      }
+
+      new_low= id - (id.page_no() % buf_read_ahead_area);
+      new_high_1= new_low + (buf_read_ahead_area - 1);
+
+      if (id != new_low && id != new_high_1)
+        /* This is not a border page of the area: return */
+        goto fail;
+      if (new_high_1.page_no() > space->last_page_number())
+        /* The area is not whole */
+        goto fail;
+    }
+    else
+      hash_lock.unlock_shared();
+
+    if (!accessed)
+      goto failed;
+    /* Note that buf_page_t::is_accessed() returns the time of the
+    first access. If some blocks of the extent existed in the buffer
+    pool at the time of a linear access pattern, the first access
+    times may be nonmonotonic, even though the latest access times
+    were linear. The threshold (srv_read_ahead_factor) should help a
+    little against this. */
+    bool fail= prev_accessed &&
+      (descending ? prev_accessed > accessed : prev_accessed < accessed);
+    prev_accessed= accessed;
+    if (fail)
+      goto failed;
+  }
+
+  /* If we got this far, read-ahead can be sensible: do it */
+  count= 0;
+  for (ulint ibuf_mode= ibuf ?
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + new_low <= new_high_1; ++new_low) + { + if (ibuf_bitmap_page(new_low, zip_size)) + continue; + if (space->is_stopping()) + break; + space->reacquire(); + if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) == + DB_SUCCESS) + count++; + } + + if (count) + { + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + new_low.page_no())); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } + + space->release(); + return count; +} + +/** Schedule a page for recovery. +@param space tablespace +@param page_id page identifier +@param recs log records +@param init page initialization, or nullptr if the page needs to be read */ +void buf_read_recover(fil_space_t *space, const page_id_t page_id, + page_recv_t &recs, recv_init *init) +{ + ut_ad(space->id == page_id.space()); + space->reacquire(); + const ulint zip_size= space->zip_size(); + + if (init) + { + if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id, + zip_size, true)) + { + ut_ad(bpage->in_file()); + os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs, + UT_LIST_GET_FIRST(space->chain), + IORequest::READ_ASYNC}, ptrdiff_t(init)); + } + } + else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, true)) + { + if (err != DB_SUCCESS_LOCKED_REC) + sql_print_error("InnoDB: Recovery failed to read page " + UINT32PF " from %s", + page_id.page_no(), space->chain.start->name); + } +} |