| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
| commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
| tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/buf | |
| parent | Initial commit. (diff) | |
| download | mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.tar.xz, mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.zip | |
Adding upstream version 1:10.5.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/buf')
| -rw-r--r-- | storage/innobase/buf/buf0block_hint.cc | 59 |
| -rw-r--r-- | storage/innobase/buf/buf0buddy.cc | 764 |
| -rw-r--r-- | storage/innobase/buf/buf0buf.cc | 4728 |
| -rw-r--r-- | storage/innobase/buf/buf0checksum.cc | 129 |
| -rw-r--r-- | storage/innobase/buf/buf0dblwr.cc | 764 |
| -rw-r--r-- | storage/innobase/buf/buf0dump.cc | 824 |
| -rw-r--r-- | storage/innobase/buf/buf0flu.cc | 2530 |
| -rw-r--r-- | storage/innobase/buf/buf0lru.cc | 1477 |
| -rw-r--r-- | storage/innobase/buf/buf0rea.cc | 785 |
9 files changed, 12060 insertions, 0 deletions
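The diffstat above covers only the buffer-pool subdirectory of this import commit. As a minimal sketch, it can be reproduced from a local clone of the mariadb-10.5 packaging repository that contains this commit (the existence of such a clone is an assumption here):

    # Reproduce the per-file statistics for this commit, limited to the
    # buffer-pool sources; assumes the commit is present in the local clone.
    git show --stat=120 a175314c3e5827eb193872241446f2f8f5c9d33c \
        -- storage/innobase/buf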
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc new file mode 100644 index 00000000..6d99d0b6 --- /dev/null +++ b/storage/innobase/buf/buf0block_hint.cc @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License, version 2.0, as published by the +Free Software Foundation. + +This program is also distributed with certain software (including but not +limited to OpenSSL) that is licensed under separate terms, as designated in a +particular file or component or in included license documentation. The authors +of MySQL hereby grant you an additional permission to link the program and +your derivative works with the separately licensed software that they have +included with MySQL. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, +for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#include "buf0block_hint.h" +namespace buf { + +void Block_hint::buffer_fix_block_if_still_valid() +{ + /* To check if m_block belongs to the current buf_pool, we must + prevent freeing memory while we check, and until we buffer-fix the + block. For this purpose it is enough to latch any of the many + latches taken by buf_pool_t::resize(). + + Similar to buf_page_optimistic_get(), we must validate + m_block->page.id() after acquiring the hash_lock, because the object + may have been freed and not actually attached to buf_pool.page_hash + at the moment. (The block could have been reused to store a + different page, and that slice of buf_pool.page_hash could be protected + by another hash_lock that we are not holding.) + + Finally, assuming that we have correct hash bucket latched, we must + validate m_block->state() to ensure that the block is not being freed. */ + if (m_block) + { + const ulint fold= m_page_id.fold(); + page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold); + if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() && + m_block->page.state() == BUF_BLOCK_FILE_PAGE) + buf_block_buf_fix_inc(m_block, __FILE__, __LINE__); + else + clear(); + hash_lock->read_unlock(); + } +} +} // namespace buf diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc new file mode 100644 index 00000000..f822adc3 --- /dev/null +++ b/storage/innobase/buf/buf0buddy.cc @@ -0,0 +1,764 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buddy.cc +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "page0zip.h" +#include "srv0start.h" + +/** When freeing a buf we attempt to coalesce by looking at its buddy +and deciding whether it is free or not. To ascertain if the buddy is +free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET +within the buddy. The question is how we can be sure that it is +safe to look at BUF_BUDDY_STAMP_OFFSET. +The answer lies in following invariants: +* All blocks allocated by buddy allocator are used for compressed +page frame. +* A compressed table always have space_id < SRV_SPACE_ID_UPPER_BOUND +* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in +a frame. + -- The above is true because we look at these fields when the + corresponding buddy block is free which implies that: + * The block we are looking at must have an address aligned at + the same size that its free buddy has. For example, if we have + a free block of 8K then its buddy's address must be aligned at + 8K as well. + * It is possible that the block we are looking at may have been + further divided into smaller sized blocks but its starting + address must still remain the start of a page frame i.e.: it + cannot be middle of a block. For example, if we have a free + block of size 8K then its buddy may be divided into blocks + of, say, 1K, 1K, 2K, 4K but the buddy's address will still be + the starting address of first 1K compressed page. + * What is important to note is that for any given block, the + buddy's address cannot be in the middle of a larger block i.e.: + in above example, our 8K block cannot have a buddy whose address + is aligned on 8K but it is part of a larger 16K block. +*/ + +/** Offset within buf_buddy_free_t where free or non_free stamps +are written.*/ +#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +/** Value that we stamp on all buffers that are currently on the zip_free +list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */ +#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND + +/** Stamp value for non-free buffers. 
Will be overwritten by a non-zero +value by the consumer of the block */ +#define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL + +/** Return type of buf_buddy_is_free() */ +enum buf_buddy_state_t { + BUF_BUDDY_STATE_FREE, /*!< If the buddy to completely free */ + BUF_BUDDY_STATE_USED, /*!< Buddy currently in used */ + BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy + are in use */ +}; + +/**********************************************************************//** +Invalidate memory area that we won't access while page is free */ +UNIV_INLINE +void +buf_buddy_mem_invalid( +/*==================*/ + buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of zip_free[] */ +{ + ut_ad(i <= BUF_BUDDY_SIZES); + + MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i); + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i); +} + +/**********************************************************************//** +Check if a buddy is stamped free. +@return whether the buddy is free */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +bool +buf_buddy_stamp_is_free( +/*====================*/ + const buf_buddy_free_t* buf) /*!< in: block to check */ +{ + compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE); + return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET) + == BUF_BUDDY_STAMP_FREE); +} + +/**********************************************************************//** +Stamps a buddy free. */ +UNIV_INLINE +void +buf_buddy_stamp_free( +/*=================*/ + buf_buddy_free_t* buf, /*!< in/out: block to stamp */ + ulint i) /*!< in: block size */ +{ + ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i)); + buf_buddy_mem_invalid(buf, i); + mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, + BUF_BUDDY_STAMP_FREE); + buf->stamp.size = i; +} + +/**********************************************************************//** +Stamps a buddy nonfree. +@param[in,out] buf block to stamp +@param[in] i block size */ +static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i) +{ + buf_buddy_mem_invalid(buf, i); + compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU); + memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); +} + +/**********************************************************************//** +Get the offset of the buddy of a compressed page frame. +@return the buddy relative of page */ +UNIV_INLINE +void* +buf_buddy_get( +/*==========*/ + byte* page, /*!< in: compressed page */ + ulint size) /*!< in: page size in bytes */ +{ + ut_ad(ut_is_2pow(size)); + ut_ad(size >= BUF_BUDDY_LOW); + ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN); + ut_ad(size < BUF_BUDDY_HIGH); + ut_ad(BUF_BUDDY_HIGH == srv_page_size); + ut_ad(!ut_align_offset(page, size)); + + if (((ulint) page) & size) { + return(page - size); + } else { + return(page + size); + } +} + +#ifdef UNIV_DEBUG +/** Validate a given zip_free list. */ +struct CheckZipFree { + CheckZipFree(ulint i) : m_i(i) {} + + void operator()(const buf_buddy_free_t* elem) const + { + ut_ad(buf_buddy_stamp_is_free(elem)); + ut_ad(elem->stamp.size <= m_i); + } + + const ulint m_i; +}; + +/** Validate a buddy list. +@param[in] i buddy size to validate */ +static void buf_buddy_list_validate(ulint i) +{ + ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i)); +} + +/**********************************************************************//** +Debug function to validate that a buffer is indeed free i.e.: in the +zip_free[]. 
+@param[in] buf block to check +@param[in] i index of buf_pool.zip_free[] +@return true if free */ +static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i) +{ + const ulint size = BUF_BUDDY_LOW << i; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + buf_buddy_free_t* itr; + + for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + itr && itr != buf; + itr = UT_LIST_GET_NEXT(list, itr)) { + } + + return(itr == buf); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Checks if a buf is free i.e.: in the zip_free[]. +@retval BUF_BUDDY_STATE_FREE if fully free +@retval BUF_BUDDY_STATE_USED if currently in use +@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */ +static MY_ATTRIBUTE((warn_unused_result)) +buf_buddy_state_t +buf_buddy_is_free( +/*==============*/ + buf_buddy_free_t* buf, /*!< in: block to check */ + ulint i) /*!< in: index of + buf_pool.zip_free[] */ +{ +#ifdef UNIV_DEBUG + const ulint size = BUF_BUDDY_LOW << i; + ut_ad(!ut_align_offset(buf, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); +#endif /* UNIV_DEBUG */ + + /* We assume that all memory from buf_buddy_alloc() + is used for compressed page frames. */ + + /* We look inside the allocated objects returned by + buf_buddy_alloc() and assume that each block is a compressed + page that contains one of the following in space_id. + * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or + * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but + not initialized yet or + * A valid space_id of a compressed tablespace + + The call below attempts to read from free memory. The memory + is "owned" by the buddy allocator (and it has been allocated + from the buffer pool), so there is nothing wrong about this. */ + if (!buf_buddy_stamp_is_free(buf)) { + return(BUF_BUDDY_STATE_USED); + } + + /* A block may be free but a fragment of it may still be in use. + To guard against that we write the free block size in terms of + zip_free index at start of stamped block. Note that we can + safely rely on this value only if the buf is free. */ + ut_ad(buf->stamp.size <= i); + return(buf->stamp.size == i + ? BUF_BUDDY_STATE_FREE + : BUF_BUDDY_STATE_PARTIALLY_USED); +} + +/** Add a block to the head of the appropriate buddy free list. +@param[in,out] buf block to be freed +@param[in] i index of buf_pool.zip_free[] */ +UNIV_INLINE +void +buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.zip_free[i].start != buf); + + buf_buddy_stamp_free(buf, i); + UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf); + ut_d(buf_buddy_list_validate(i)); +} + +/** Remove a block from the appropriate buddy free list. +@param[in,out] buf block to be freed +@param[in] i index of buf_pool.zip_free[] */ +UNIV_INLINE +void +buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_buddy_check_free(buf, i)); + + UT_LIST_REMOVE(buf_pool.zip_free[i], buf); + buf_buddy_stamp_nonfree(buf, i); +} + +/** Try to allocate a block from buf_pool.zip_free[]. 
+@param[in] i index of buf_pool.zip_free[] +@return allocated block, or NULL if buf_pool.zip_free[] was empty */ +static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i) +{ + buf_buddy_free_t* buf; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + ut_d(buf_buddy_list_validate(i)); + + buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + if (buf_pool.curr_size < buf_pool.old_size + && UT_LIST_GET_LEN(buf_pool.withdraw) + < buf_pool.withdraw_target) { + + while (buf != NULL + && buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(buf))) { + /* This should be withdrawn, not to be allocated */ + buf = UT_LIST_GET_NEXT(list, buf); + } + } + + if (buf) { + buf_buddy_remove_from_free(buf, i); + } else if (i + 1 < BUF_BUDDY_SIZES) { + /* Attempt to split. */ + buf = buf_buddy_alloc_zip(i + 1); + + if (buf) { + buf_buddy_free_t* buddy = + reinterpret_cast<buf_buddy_free_t*>( + reinterpret_cast<byte*>(buf) + + (BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool.contains_zip(buddy)); + buf_buddy_add_to_free(buddy, i); + } + } + + if (buf) { + /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */ + MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET); + MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes, + (BUF_BUDDY_LOW << i) + - (BUF_BUDDY_STAMP_OFFSET + 4)); + ut_ad(mach_read_from_4(buf->stamp.bytes + + BUF_BUDDY_STAMP_OFFSET) + == BUF_BUDDY_STAMP_NONFREE); + } + + return(buf); +} + +/** Deallocate a buffer frame of srv_page_size. +@param[in] buf buffer frame to deallocate */ +static +void +buf_buddy_block_free(void* buf) +{ + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); + buf_page_t* bpage; + buf_block_t* block; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(!ut_align_offset(buf, srv_page_size)); + + HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->state() == BUF_BLOCK_MEMORY + && bpage->in_zip_hash), + ((buf_block_t*) bpage)->frame == buf); + ut_a(bpage); + ut_a(bpage->state() == BUF_BLOCK_MEMORY); + ut_ad(bpage->in_zip_hash); + ut_d(bpage->in_zip_hash = false); + HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage); + + ut_d(memset(buf, 0, srv_page_size)); + MEM_UNDEFINED(buf, srv_page_size); + + block = (buf_block_t*) bpage; + buf_LRU_block_free_non_file_page(block); + + ut_ad(buf_pool.buddy_n_frames > 0); + ut_d(buf_pool.buddy_n_frames--); +} + +/**********************************************************************//** +Allocate a buffer block to the buddy allocator. */ +static +void +buf_buddy_block_register( +/*=====================*/ + buf_block_t* block) /*!< in: buffer frame to allocate */ +{ + const ulint fold = BUF_POOL_ZIP_FOLD(block); + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); + + ut_a(block->frame); + ut_a(!ut_align_offset(block->frame, srv_page_size)); + + ut_ad(!block->page.in_zip_hash); + ut_d(block->page.in_zip_hash = true); + HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page); + + ut_d(buf_pool.buddy_n_frames++); +} + +/** Allocate a block from a bigger object. +@param[in] buf a block that is free to use +@param[in] i index of buf_pool.zip_free[] +@param[in] j size of buf as an index of buf_pool.zip_free[] +@return allocated block */ +static +void* +buf_buddy_alloc_from(void* buf, ulint i, ulint j) +{ + ulint offs = BUF_BUDDY_LOW << j; + ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + ut_ad(j >= i); + ut_ad(!ut_align_offset(buf, offs)); + + /* Add the unused parts of the block to the free lists. 
*/ + while (j > i) { + buf_buddy_free_t* zip_buf; + + offs >>= 1; + j--; + + zip_buf = reinterpret_cast<buf_buddy_free_t*>( + reinterpret_cast<byte*>(buf) + offs); + buf_buddy_add_to_free(zip_buf, j); + } + + buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i); + return(buf); +} + +/** Allocate a ROW_FORMAT=COMPRESSED block. +@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES +@param lru assigned to true if buf_pool.mutex was temporarily released +@return allocated block, never NULL */ +byte *buf_buddy_alloc_low(ulint i, bool *lru) +{ + buf_block_t* block; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. */ + block = (buf_block_t*) buf_buddy_alloc_zip(i); + + if (block) { + goto func_exit; + } + } + + /* Try allocating from the buf_pool.free list. */ + block = buf_LRU_get_free_only(); + + if (block) { + goto alloc_big; + } + + /* Try replacing an uncompressed page in the buffer pool. */ + block = buf_LRU_get_free_block(true); + if (lru) { + *lru = true; + } + +alloc_big: + buf_buddy_block_register(block); + + block = (buf_block_t*) buf_buddy_alloc_from( + block->frame, i, BUF_BUDDY_SIZES); + +func_exit: + buf_pool.buddy_stat[i].used++; + return reinterpret_cast<byte*>(block); +} + +/** Try to relocate a block. The caller must hold zip_free_mutex, and this +function will release and lock it again. +@param[in] src block to relocate +@param[in] dst free block to relocated to +@param[in] i index of buf_pool.zip_free[] +@param[in] force true if we must relocated always +@return true if relocated */ +static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) +{ + buf_page_t* bpage; + const ulint size = BUF_BUDDY_LOW << i; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!ut_align_offset(src, size)); + ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + MEM_CHECK_ADDRESSABLE(dst, size); + + uint32_t space = mach_read_from_4(static_cast<const byte*>(src) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + uint32_t offset = mach_read_from_4(static_cast<const byte*>(src) + + FIL_PAGE_OFFSET); + + /* Suppress Valgrind or MSAN warnings. */ + MEM_MAKE_DEFINED(&space, sizeof space); + MEM_MAKE_DEFINED(&offset, sizeof offset); + + ut_ad(space != BUF_BUDDY_STAMP_FREE); + + const page_id_t page_id(space, offset); + const ulint fold= page_id.fold(); + + bpage = buf_pool.page_hash_get_low(page_id, fold); + + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool.page_hash yet. Obviously, + it cannot be relocated. */ + + if (!force || space != 0 || offset != 0) { + return(false); + } + + /* It might be just uninitialized page. + We should search from LRU list also. */ + + bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + while (bpage != NULL) { + if (bpage->zip.data == src) { + ut_ad(bpage->id() == page_id); + break; + } + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + + if (bpage == NULL) { + return(false); + } + } + + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. */ + ut_ad(page_zip_get_size(&bpage->zip) < size); + return(false); + } + + /* The block must have been allocated, but it may + contain uninitialized data. 
*/ + MEM_CHECK_ADDRESSABLE(src, size); + + if (!bpage->can_relocate()) { + return false; + } + + page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + + if (bpage->can_relocate()) { + /* Relocate the compressed page. */ + const ulonglong ns = my_interval_timer(); + + ut_a(bpage->zip.data == src); + + memcpy(dst, src, size); + bpage->zip.data = reinterpret_cast<page_zip_t*>(dst); + + hash_lock->write_unlock(); + + buf_buddy_mem_invalid( + reinterpret_cast<buf_buddy_free_t*>(src), i); + + buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000; + return(true); + } + + hash_lock->write_unlock(); + + return(false); +} + +/** Deallocate a block. +@param[in] buf block to be freed, must not be pointed to + by the buffer pool +@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ +void buf_buddy_free_low(void* buf, ulint i) +{ + buf_buddy_free_t* buddy; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + ut_ad(buf_pool.buddy_stat[i].used > 0); + + buf_pool.buddy_stat[i].used--; +recombine: + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i); + + if (i == BUF_BUDDY_SIZES) { + buf_buddy_block_free(buf); + return; + } + + ut_ad(i < BUF_BUDDY_SIZES); + ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool.contains_zip(buf)); + + /* Do not recombine blocks if there are few free blocks. + We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16 + && buf_pool.curr_size >= buf_pool.old_size) { + goto func_exit; + } + + /* Try to combine adjacent blocks. */ + buddy = reinterpret_cast<buf_buddy_free_t*>( + buf_buddy_get(reinterpret_cast<byte*>(buf), + BUF_BUDDY_LOW << i)); + + switch (buf_buddy_is_free(buddy, i)) { + case BUF_BUDDY_STATE_FREE: + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(buddy, i); +buddy_is_free: + ut_ad(!buf_pool.contains_zip(buddy)); + i++; + buf = ut_align_down(buf, BUF_BUDDY_LOW << i); + + goto recombine; + + case BUF_BUDDY_STATE_USED: + ut_d(buf_buddy_list_validate(i)); + + /* The buddy is not free. Is there a free block of + this size? */ + if (buf_buddy_free_t* zip_buf = + UT_LIST_GET_FIRST(buf_pool.zip_free[i])) { + + /* Remove the block from the free list, because + a successful buf_buddy_relocate() will overwrite + zip_free->list. */ + buf_buddy_remove_from_free(zip_buf, i); + + /* Try to relocate the buddy of buf to the free + block. */ + if (buf_buddy_relocate(buddy, zip_buf, i, false)) { + goto buddy_is_free; + } + + buf_buddy_add_to_free(zip_buf, i); + } + + break; + case BUF_BUDDY_STATE_PARTIALLY_USED: + /* Some sub-blocks in the buddy are still in use. + Relocation will fail. No need to try. */ + break; + } + +func_exit: + /* Free the block to the buddy list. */ + buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i); +} + +/** Try to reallocate a block. +@param[in] buf buf_pool block to be reallocated +@param[in] size block size, up to srv_page_size +@return whether the reallocation succeeded */ +bool +buf_buddy_realloc(void* buf, ulint size) +{ + buf_block_t* block = NULL; + ulint i = buf_buddy_get_slot(size); + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. 
*/ + block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i)); + } + + if (block == NULL) { + /* Try allocating from the buf_pool.free list. */ + block = buf_LRU_get_free_only(); + + if (block == NULL) { + return(false); /* free_list was not enough */ + } + + buf_buddy_block_register(block); + + block = reinterpret_cast<buf_block_t*>( + buf_buddy_alloc_from( + block->frame, i, BUF_BUDDY_SIZES)); + } + + buf_pool.buddy_stat[i].used++; + + /* Try to relocate the buddy of buf to the free block. */ + if (buf_buddy_relocate(buf, block, i, true)) { + /* succeeded */ + buf_buddy_free_low(buf, i); + } else { + /* failed */ + buf_buddy_free_low(block, i); + } + + return(true); /* free_list was enough */ +} + +/** Combine all pairs of free buddies. */ +void buf_buddy_condense_free() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.curr_size < buf_pool.old_size); + + for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) { + buf_buddy_free_t* buf = + UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + /* seek to withdraw target */ + while (buf != NULL + && !buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(buf))) { + buf = UT_LIST_GET_NEXT(list, buf); + } + + while (buf != NULL) { + buf_buddy_free_t* next = + UT_LIST_GET_NEXT(list, buf); + + buf_buddy_free_t* buddy = + reinterpret_cast<buf_buddy_free_t*>( + buf_buddy_get( + reinterpret_cast<byte*>(buf), + BUF_BUDDY_LOW << i)); + + /* seek to the next withdraw target */ + while (true) { + while (next != NULL + && !buf_pool.will_be_withdrawn( + reinterpret_cast<byte*>(next))) { + next = UT_LIST_GET_NEXT(list, next); + } + + if (buddy != next) { + break; + } + + next = UT_LIST_GET_NEXT(list, next); + } + + if (buf_buddy_is_free(buddy, i) + == BUF_BUDDY_STATE_FREE) { + /* Both buf and buddy are free. + Try to combine them. */ + buf_buddy_remove_from_free(buf, i); + buf_pool.buddy_stat[i].used++; + + buf_buddy_free_low(buf, i); + } + + buf = next; + } + } +} diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc new file mode 100644 index 00000000..b658bdfc --- /dev/null +++ b/storage/innobase/buf/buf0buf.cc @@ -0,0 +1,4728 @@ +/***************************************************************************** + +Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "mtr0types.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0checksum.h" +#include "ut0crc32.h" +#include <string.h> + +#ifndef UNIV_INNOCHECKSUM +#include "my_cpu.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "lock0lock.h" +#include "sync0rw.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "log0log.h" +#include "dict0stats_bg.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "srv0mon.h" +#include "log0crypt.h" +#include "fil0pagecompress.h" +#endif /* !UNIV_INNOCHECKSUM */ +#include "page0zip.h" +#include "sync0sync.h" +#include "buf0dump.h" +#include <map> +#include <sstream> + +using st_::span; + +#ifdef HAVE_LIBNUMA +#include <numa.h> +#include <numaif.h> +struct set_numa_interleave_t +{ + set_numa_interleave_t() + { + if (srv_numa_interleave) { + + struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + ib::info() << "Setting NUMA memory policy to" + " MPOL_INTERLEAVE"; + if (set_mempolicy(MPOL_INTERLEAVE, + numa_mems_allowed->maskp, + numa_mems_allowed->size) != 0) { + + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_INTERLEAVE: " + << strerror(errno); + } + numa_bitmask_free(numa_mems_allowed); + } + } + + ~set_numa_interleave_t() + { + if (srv_numa_interleave) { + + ib::info() << "Setting NUMA memory policy to" + " MPOL_DEFAULT"; + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_DEFAULT: " + << strerror(errno); + } + } + } +}; + +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa +#else +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE +#endif /* HAVE_LIBNUMA */ + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool.mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool.mutex. + +The buf_pool.mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. 
We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool.mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool.mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool.free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate block->page.belongs_to_unzip_LRU() +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool.flush_list) contains the blocks +holding persistent file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool.flush_list_mutex. 
+ +The control blocks for uncompressed pages are accessible via +buf_block_t objects that are reachable via buf_pool.chunks[]. +The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages +that are not in buf_pool.flush_list and for which no uncompressed +page has been allocated in buf_pool are only accessible via +buf_pool.LRU. + +The chains of free memory blocks (buf_pool.zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These +blocks are inside the srv_page_size-sized memory blocks of type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating control +blocks for compressed pages (buf_page_t) and compressed page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix field is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_INNOCHECKSUM +void page_hash_latch::read_lock_wait() +{ + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + ut_delay(srv_spin_wait_delay); + if (read_trylock()) + return; + } + /* Fall back to yielding to other threads. 
*/ + do + os_thread_yield(); + while (!read_trylock()); +} + +void page_hash_latch::write_lock_wait() +{ + write_lock_wait_start(); + + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + if (write_lock_poll()) + return; + ut_delay(srv_spin_wait_delay); + } + + /* Fall back to yielding to other threads. */ + do + os_thread_yield(); + while (!write_lock_poll()); +} + +/** Value in microseconds */ +constexpr int WAIT_FOR_READ= 100; +constexpr int WAIT_FOR_WRITE= 100; +/** Number of attempts made to read in a page in the buffer pool */ +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; +/** The maximum portion of the buffer pool that can be used for the +read-ahead buffer. (Divide buf_pool size by this amount) */ +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). */ +const byte *field_ref_zero; + +/** The InnoDB buffer pool */ +buf_pool_t buf_pool; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; + +#ifdef UNIV_DEBUG +/** Disable resizing buffer pool to make assertion code not expensive. */ +my_bool buf_disable_resize_buffer_pool_debug = TRUE; + +/** This is used to insert validation operations in execution +in the debug version */ +static ulint buf_dbg_counter; +#endif /* UNIV_DEBUG */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(io_type, counter) \ + ((io_type == BUF_IO_READ) \ + ? (counter##_READ) \ + : (counter##_WRITTEN)) + + +/** Decrypt a page for temporary tablespace. +@param[in,out] tmp_frame Temporary buffer +@param[in] src_frame Page to decrypt +@return true if temporary tablespace decrypted, false if not */ +static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) +{ + if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) { + return true; + } + + /* read space & lsn */ + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + /* Copy FIL page header, it is not encrypted */ + memcpy(tmp_frame, src_frame, header_len); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + header_len; + byte* dst = tmp_frame + header_len; + uint srclen = uint(srv_page_size) + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); + ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + if (!log_tmp_block_decrypt(src, srclen, dst, + (offset * srv_page_size))) { + return false; + } + + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(src_frame, tmp_frame, + srv_page_size); + srv_stats.pages_decrypted.inc(); + srv_stats.n_temp_blocks_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page. +@param[in,out] bpage Page control block +@param[in] node data file +@return whether the operation was successful */ +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; + + byte* dst_frame = bpage->zip.data ? 
bpage->zip.data : + ((buf_block_t*) bpage)->frame; + bool page_compressed = node.space->is_compressed() + && buf_page_is_compressed(dst_frame, flags); + const page_id_t id(bpage->id()); + + if (id.page_no() == 0) { + /* File header pages are not encrypted/compressed */ + return (true); + } + + if (node.space->purpose == FIL_TYPE_TEMPORARY + && innodb_encrypt_temporary_tables) { + buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve(); + ut_a(slot); + slot->allocate(); + + if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) { + slot->release(); + ib::error() << "Encrypted page " << id + << " in file " << node.name; + return false; + } + + slot->release(); + return true; + } + + /* Page is encrypted if encryption information is found from + tablespace and page contains used key_version. This is true + also for pages first compressed and then encrypted. */ + + buf_tmp_buffer_t* slot; + uint key_version = buf_page_get_key_version(dst_frame, flags); + + if (page_compressed && !key_version) { + /* the page we read is unencrypted */ + /* Find free slot from temporary memory array */ +decompress: + if (fil_space_t::full_crc32(flags) + && buf_page_is_corrupted(true, dst_frame, flags)) { + return false; + } + + slot = buf_pool.io_buf_reserve(); + ut_a(slot); + slot->allocate(); + +decompress_with_slot: + ut_d(fil_page_type_validate(node.space, dst_frame)); + + ulint write_size = fil_page_decompress( + slot->crypt_buf, dst_frame, flags); + slot->release(); + ut_ad(!write_size + || fil_page_type_validate(node.space, dst_frame)); + ut_ad(node.space->referenced()); + return write_size != 0; + } + + if (key_version && node.space->crypt_data) { + /* Verify encryption checksum before we even try to + decrypt. */ + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { +decrypt_failed: + ib::error() << "Encrypted page " << id + << " in file " << node.name + << " looks corrupted; key_version=" + << key_version; + return false; + } + + slot = buf_pool.io_buf_reserve(); + ut_a(slot); + slot->allocate(); + ut_d(fil_page_type_validate(node.space, dst_frame)); + + /* decrypt using crypt_buf to dst_frame */ + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { + slot->release(); + goto decrypt_failed; + } + + ut_d(fil_page_type_validate(node.space, dst_frame)); + + if ((fil_space_t::full_crc32(flags) && page_compressed) + || fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress_with_slot; + } + + slot->release(); + } else if (fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress; + } + + ut_ad(node.space->referenced()); + return true; +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. 
*/ +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + +#ifdef UNIV_INNOCHECKSUM + if (log_file + && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) { + fprintf(log_file, "page::" UINT32PF ";" + " crc32 calculated = " UINT32PF ";" + " recorded checksum field1 = " ULINTPF " recorded" + " checksum field2 =" ULINTPF "\n", cur_page_num, + crc32, checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + if (checksum_field1 != checksum_field2) { + return false; + } + + return checksum_field1 == crc32; +} + +/** Checks if the page is in innodb checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in innodb checksum format. */ +bool +buf_page_is_checksum_valid_innodb( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + /* There are 2 valid formulas for + checksum_field2 (old checksum field) which algo=innodb could have + written to the page: + + 1. Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. + + 2. Newer InnoDB versions store the old formula checksum + (buf_calc_page_old_checksum()). */ + + ulint old_checksum = buf_calc_page_old_checksum(read_buf); + ulint new_checksum = buf_calc_page_new_checksum(read_buf); + +#ifdef UNIV_INNOCHECKSUM + if (log_file + && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) { + fprintf(log_file, "page::" UINT32PF ";" + " old style: calculated =" + " " ULINTPF "; recorded = " ULINTPF "\n", + cur_page_num, old_checksum, + checksum_field2); + fprintf(log_file, "page::" UINT32PF ";" + " new style: calculated =" + " " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n", + cur_page_num, new_checksum, + buf_calc_page_crc32(read_buf), checksum_field1); + } + + if (log_file + && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) { + fprintf(log_file, "page::" UINT32PF ";" + " old style: calculated =" + " " ULINTPF "; recorded checksum = " ULINTPF "\n", + cur_page_num, old_checksum, + checksum_field2); + fprintf(log_file, "page::" UINT32PF ";" + " new style: calculated =" + " " ULINTPF "; recorded checksum = " ULINTPF "\n", + cur_page_num, new_checksum, + checksum_field1); + } +#endif /* UNIV_INNOCHECKSUM */ + + + if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != old_checksum) { + DBUG_LOG("checksum", + "Page checksum crc32 not valid" + << " field1 " << checksum_field1 + << " field2 " << checksum_field2 + << " crc32 " << buf_calc_page_old_checksum(read_buf) + << " lsn " << mach_read_from_4( + read_buf + FIL_PAGE_LSN)); + return(false); + } + + /* old field is fine, check the new field */ + + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */ + + if (checksum_field1 != 0 && checksum_field1 != new_checksum) { + DBUG_LOG("checksum", + "Page checksum crc32 not valid" + << " field1 " << checksum_field1 + << " field2 " << checksum_field2 + << " crc32 " << buf_calc_page_new_checksum(read_buf) + << " lsn " << mach_read_from_4( + read_buf + FIL_PAGE_LSN)); + return(false); + } + + return(true); +} + +/** Checks if the page is in none checksum format. 
+@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in none checksum format. */ +bool +buf_page_is_checksum_valid_none( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ +#ifndef DBUG_OFF + if (checksum_field1 != checksum_field2 + && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) { + DBUG_LOG("checksum", + "Page checksum crc32 not valid" + << " field1 " << checksum_field1 + << " field2 " << checksum_field2 + << " crc32 " << BUF_NO_CHECKSUM_MAGIC + << " lsn " << mach_read_from_4(read_buf + + FIL_PAGE_LSN)); + } +#endif /* DBUG_OFF */ + +#ifdef UNIV_INNOCHECKSUM + if (log_file + && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) { + fprintf(log_file, + "page::" UINT32PF "; none checksum: calculated" + " = %lu; recorded checksum_field1 = " ULINTPF + " recorded checksum_field2 = " ULINTPF "\n", + cur_page_num, BUF_NO_CHECKSUM_MAGIC, + checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + return(checksum_field1 == checksum_field2 + && checksum_field1 == BUF_NO_CHECKSUM_MAGIC); +} + +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. */ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) +{ +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + const lsn_t current_lsn = log_sys.get_lsn(); + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + if (current_lsn < page_lsn) { + + const uint32_t space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const uint32_t page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ +} + + +/** Check if a buffer is all zeroes. +@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(span<const byte> buf) +{ + ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX); + return memcmp(buf.data(), field_ref_zero, buf.size()) == 0; +} + +/** Check if a page is corrupt. 
+@param[in] check_lsn whether the LSN should be checked +@param[in] read_buf database page +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] space tablespace +@return whether the page is corrupted */ +bool +buf_page_is_corrupted( + bool check_lsn, + const byte* read_buf, + ulint fsp_flags) +{ +#ifndef UNIV_INNOCHECKSUM + DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); ); +#endif + if (fil_space_t::full_crc32(fsp_flags)) { + bool compressed = false, corrupted = false; + const uint size = buf_page_full_crc32_size( + read_buf, &compressed, &corrupted); + if (corrupted) { + return true; + } + const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM); + uint crc32 = mach_read_from_4(end); + + if (!crc32 && size == srv_page_size + && buf_is_zeroes(span<const byte>(read_buf, size))) { + return false; + } + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 2) { + crc32++; + } + }); + + if (crc32 != ut_crc32(read_buf, + size - FIL_PAGE_FCRC32_CHECKSUM)) { + return true; + } + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + if (!compressed + && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION + + read_buf) + && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4), + end - (FIL_PAGE_FCRC32_END_LSN + - FIL_PAGE_FCRC32_CHECKSUM), + 4)) { + return true; + } + + buf_page_check_lsn(check_lsn, read_buf); + return false; + } + + size_t checksum_field1 = 0; + size_t checksum_field2 = 0; + uint32_t crc32 = 0; + bool crc32_inited = false; + bool crc32_chksum = false; + const ulint zip_size = fil_space_t::zip_size(fsp_flags); + const uint16_t page_type = fil_page_get_type(read_buf); + + /* We can trust page type if page compression is set on tablespace + flags because page compression flag means file must have been + created with 10.1 (later than 5.5 code base). In 10.1 page + compressed tables do not contain post compression checksum and + FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can + be null if we are in fil_check_first_page() and first page + is not compressed or encrypted. Page checksum is verified + after decompression (i.e. normally pages are already + decompressed at this stage). 
*/ + if ((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) +#ifndef UNIV_INNOCHECKSUM + && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags) +#endif + ) { + return(false); + } + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment"); + + if (!zip_size + && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4, + read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(true); + } + + buf_page_check_lsn(check_lsn, read_buf); + + /* Check whether the checksum fields have correct values */ + + const srv_checksum_algorithm_t curr_algo = + static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm); + + if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) { + return(false); + } + + if (zip_size) { + return !page_zip_verify_checksum(read_buf, zip_size); + } + + checksum_field1 = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + checksum_field2 = mach_read_from_4( + read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + + /* A page filled with NUL bytes is considered not corrupted. + Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7), + the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero + for the first page of each file of the system tablespace. + We want to ignore it for the system tablespace, but because + we do not know the expected tablespace here, we ignore the + field for all data files, except for + innodb_checksum_algorithm=full_crc32 which we handled above. */ + if (!checksum_field1 && !checksum_field2) { + /* Checksum fields can have valid value as zero. + If the page is not empty then do the checksum + calculation for the page. 
*/ + bool all_zeroes = true; + for (size_t i = 0; i < srv_page_size; i++) { +#ifndef UNIV_INNOCHECKSUM + if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) { + i += 8; + } +#endif + if (read_buf[i]) { + all_zeroes = false; + break; + } + } + + if (all_zeroes) { + return false; + } + } + + switch (curr_algo) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + return !buf_page_is_checksum_valid_crc32( + read_buf, checksum_field1, checksum_field2); + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + return !buf_page_is_checksum_valid_innodb( + read_buf, checksum_field1, checksum_field2); + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return !buf_page_is_checksum_valid_none( + read_buf, checksum_field1, checksum_field2); + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_INNODB: + if (buf_page_is_checksum_valid_none(read_buf, + checksum_field1, checksum_field2)) { +#ifdef UNIV_INNOCHECKSUM + if (log_file) { + fprintf(log_file, "page::" UINT32PF ";" + " old style: calculated = %u;" + " recorded = " ULINTPF ";\n", + cur_page_num, + buf_calc_page_old_checksum(read_buf), + checksum_field2); + fprintf(log_file, "page::" UINT32PF ";" + " new style: calculated = " UINT32PF ";" + " crc32 = " UINT32PF "; recorded = " ULINTPF ";\n", + cur_page_num, + buf_calc_page_new_checksum(read_buf), + buf_calc_page_crc32(read_buf), + checksum_field1); + } +#endif /* UNIV_INNOCHECKSUM */ + return false; + } + + crc32_chksum = curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32 + || curr_algo == SRV_CHECKSUM_ALGORITHM_FULL_CRC32; + + /* Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. */ + + /* Since innodb_checksum_algorithm is not strict_* allow + any of the algos to match for the old field */ + + if (checksum_field2 + != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { + + if (crc32_chksum) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = true; + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 2) { + crc32++; + } + }); + + if (checksum_field2 != crc32 + && checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + return true; + } + } else { + ut_ad(curr_algo + == SRV_CHECKSUM_ALGORITHM_INNODB); + + if (checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = true; + + if (checksum_field2 != crc32) { + return true; + } + } + } + } + + if (checksum_field1 == 0 + || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) { + } else if (crc32_chksum) { + + if (!crc32_inited) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = true; + } + + if (checksum_field1 != crc32 + && checksum_field1 + != buf_calc_page_new_checksum(read_buf)) { + return true; + } + } else { + ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB); + + if (checksum_field1 + != buf_calc_page_new_checksum(read_buf)) { + + if (!crc32_inited) { + crc32 = buf_calc_page_crc32(read_buf); + crc32_inited = true; + } + + if (checksum_field1 != crc32) { + return true; + } + } + } + + if (crc32_inited + && ((checksum_field1 == crc32 + && checksum_field2 != crc32) + || (checksum_field1 != crc32 + && checksum_field2 == crc32))) { + return true; + } + + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + /* should have returned false earlier */ + break; + } + + return false; +} + +#ifndef UNIV_INNOCHECKSUM + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && 
defined(MADV_DODUMP)
+/** Enable buffers to be dumped to core files
+
+A convenience function, not called anywhere directly; however,
+it is left available for gdb or any debugger to call
+in the event that you want all of the memory to be dumped
+to a core file.
+
+Returns the number of errors found in madvise calls. */
+int
+buf_madvise_do_dump()
+{
+ int ret= 0;
+
+ /* mirrors allocation in log_t::create() */
+ if (log_sys.buf) {
+ ret += madvise(log_sys.buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ ret += madvise(log_sys.flush_buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ }
+ /* mirrors recv_sys_t::create() */
+ if (recv_sys.buf)
+ {
+ ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ auto chunk = buf_pool.chunks;
+
+ for (ulint n = buf_pool.n_chunks; n--; chunk++) {
+ ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return ret;
+}
+#endif
+
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size)
+{
+ dict_index_t* index;
+
+#ifndef UNIV_DEBUG
+ const ulint size = zip_size ? zip_size : srv_page_size;
+ ib::info() << "Page dump in ascii and hex ("
+ << size << " bytes):";
+
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+#endif
+
+ if (zip_size) {
+ /* Print compressed page. */
+ ib::info() << "Compressed page type ("
+ << fil_page_get_type(read_buf)
+ << "); stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << "; calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << "; page LSN "
+ << mach_read_from_8(read_buf + FIL_PAGE_LSN)
+ << "; page number (if stored to page"
+ << " already) "
+ << mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+ << "; space id (if stored to page already) "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ } else {
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+ ulint page_type = fil_page_get_type(read_buf);
+
+ ib::info() << "Uncompressed page, stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+ << crc32
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+ << buf_calc_page_new_checksum(read_buf)
+ << ", "
+ << " page type " << page_type << " == "
+ << fil_get_page_type_name(page_type) << "."
+ << buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_NONE) << " " + << BUF_NO_CHECKSUM_MAGIC + << ", stored checksum in field2 " + << mach_read_from_4(read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM) + << ", calculated checksums for field2: " + << buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_CRC32) << " " + << crc32 + << ", " + << buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_INNODB) << " " + << buf_calc_page_old_checksum(read_buf) + << ", " + << buf_checksum_algorithm_name( + SRV_CHECKSUM_ALGORITHM_NONE) << " " + << BUF_NO_CHECKSUM_MAGIC + << ", page LSN " + << mach_read_from_4(read_buf + FIL_PAGE_LSN) + << " " + << mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) + << ", low 4 bytes of LSN at page end " + << mach_read_from_4(read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4) + << ", page number (if stored to page already) " + << mach_read_from_4(read_buf + FIL_PAGE_OFFSET) + << ", space id (if created with >= MySQL-4.1.1" + " and stored already) " + << mach_read_from_4( + read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + switch (fil_page_get_type(read_buf)) { + index_id_t index_id; + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + index_id = btr_page_get_index_id(read_buf); + ib::info() << "Page may be an index page where" + " index id is " << index_id; + + index = dict_index_find_on_id_low(index_id); + if (index) { + ib::info() + << "Index " << index_id + << " is " << index->name + << " in table " << index->table->name; + } + break; + case FIL_PAGE_UNDO_LOG: + fputs("InnoDB: Page may be an undo log page\n", stderr); + break; + case FIL_PAGE_INODE: + fputs("InnoDB: Page may be an 'inode' page\n", stderr); + break; + case FIL_PAGE_IBUF_FREE_LIST: + fputs("InnoDB: Page may be an insert buffer free list page\n", + stderr); + break; + case FIL_PAGE_TYPE_ALLOCATED: + fputs("InnoDB: Page may be a freshly allocated page\n", + stderr); + break; + case FIL_PAGE_IBUF_BITMAP: + fputs("InnoDB: Page may be an insert buffer bitmap page\n", + stderr); + break; + case FIL_PAGE_TYPE_SYS: + fputs("InnoDB: Page may be a system page\n", + stderr); + break; + case FIL_PAGE_TYPE_TRX_SYS: + fputs("InnoDB: Page may be a transaction system page\n", + stderr); + break; + case FIL_PAGE_TYPE_FSP_HDR: + fputs("InnoDB: Page may be a file space header page\n", + stderr); + break; + case FIL_PAGE_TYPE_XDES: + fputs("InnoDB: Page may be an extent descriptor page\n", + stderr); + break; + case FIL_PAGE_TYPE_BLOB: + fputs("InnoDB: Page may be a BLOB page\n", + stderr); + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + fputs("InnoDB: Page may be a compressed BLOB page\n", + stderr); + break; + } +} + +/** Initialize a buffer page descriptor. +@param[in,out] block buffer page descriptor +@param[in] frame buffer page frame */ +static +void +buf_block_init(buf_block_t* block, byte* frame) +{ + /* This function should only be executed at database startup or by + buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ + assert_block_ahi_empty_on_init(block); + + block->frame = frame; + + block->modify_clock = 0; + block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL)); +#ifdef BTR_CUR_HASH_ADAPT + block->index = NULL; +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); + + page_zip_des_init(&block->page.zip); + + ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t))); + + rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING); + + ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch, + SYNC_LEVEL_VARYING)); + + block->lock.is_block_lock = 1; + + ut_ad(rw_lock_validate(&(block->lock))); +} + +/** Allocate a chunk of buffer frames. +@param bytes requested size +@return whether the allocation succeeded */ +inline bool buf_pool_t::chunk_t::create(size_t bytes) +{ + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); + /* Round down to a multiple of page size, although it already should be. */ + bytes= ut_2pow_round<size_t>(bytes, srv_page_size); + + mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); + + if (UNIV_UNLIKELY(!mem)) + return false; + + MEM_UNDEFINED(mem, mem_size()); + +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + if (mbind(mem, mem_size(), MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + { + ib::warn() << "Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: " << strerror(errno) << ")."; + } + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ + + + /* Allocate the block descriptors from + the start of the memory block. */ + blocks= reinterpret_cast<buf_block_t*>(mem); + + /* Align a pointer to the first frame. Note that when + opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); + + byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) + + srv_page_size - 1) & + ~ulint{srv_page_size - 1}); + size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint s= size; + + while (frame < reinterpret_cast<const byte*>(blocks + s)) + { + frame+= srv_page_size; + s--; + } + + size= s; + } + + /* Init block structs and assign frames for them. Then we assign the + frames to the first blocks (we already mapped the memory above). */ + + buf_block_t *block= blocks; + + for (auto i= size; i--; ) { + buf_block_init(block, frame); + MEM_UNDEFINED(block->frame, srv_page_size); + /* Add the block to the free list */ + UT_LIST_ADD_LAST(buf_pool.free, &block->page); + + ut_d(block->page.in_free_list = TRUE); + block++; + frame+= srv_page_size; + } + + reg(); + + return true; +} + +#ifdef UNIV_DEBUG +/** Check that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const +{ + buf_block_t *block= blocks; + for (auto i= size; i--; block++) + { + switch (block->page.state()) { + case BUF_BLOCK_ZIP_PAGE: + /* The uncompressed buffer pool should never + contain ROW_FORMAT=COMPRESSED block descriptors. 
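+ Such BUF_BLOCK_ZIP_PAGE descriptors are allocated separately by
+ buf_page_alloc_descriptor() and are never placed in chunk_t::blocks,
+ which is the array being scanned here; hence the ut_error below.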
*/ + ut_error; + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + /* Skip blocks that are not being used for file pages. */ + break; + case BUF_BLOCK_FILE_PAGE: + const lsn_t lsn= block->page.oldest_modification(); + + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + ut_ad(!block->page.buf_fix_count()); + ut_ad(block->page.io_fix() == BUF_IO_NONE); + break; + } + + if (fsp_is_system_temporary(block->page.id().space())) + { + ut_ad(lsn == 0 || lsn == 2); + break; + } + + if (lsn > 1 || !block->page.can_relocate()) + return block; + + break; + } + } + + return nullptr; +} +#endif /* UNIV_DEBUG */ + +/** Free the synchronization objects of a buffer pool block descriptor +@param[in,out] block buffer pool block descriptor */ +static void buf_block_free_mutexes(buf_block_t* block) +{ + rw_lock_free(&block->lock); + ut_d(rw_lock_free(block->debug_latch)); + ut_d(ut_free(block->debug_latch)); +} + +/** Create the hash table. +@param n the lower bound of n_cells */ +void buf_pool_t::page_hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= pad(n_cells) * sizeof *array; + void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset(v, 0, size); + array= static_cast<hash_cell_t*>(v); +} + +/** Create the buffer pool. +@return whether the creation failed */ +bool buf_pool_t::create() +{ + ut_ad(this == &buf_pool); + ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); + ut_ad(!is_initialised()); + ut_ad(srv_buf_pool_size > 0); + ut_ad(!resizing); + ut_ad(!chunks_old); + ut_ad(!field_ref_zero); + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + field_ref_zero= static_cast<const byte*> + (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); + else + return true; + + chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + + new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool); + + n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; + const size_t chunk_size= srv_buf_pool_chunk_unit; + + chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks)); + UT_LIST_INIT(free, &buf_page_t::list); + curr_size= 0; + auto chunk= chunks; + + do + { + if (!chunk->create(chunk_size)) + { + while (--chunk >= chunks) + { + buf_block_t* block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + buf_block_free_mutexes(block); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + aligned_free(const_cast<byte*>(field_ref_zero)); + field_ref_zero= nullptr; + ut_ad(!is_initialised()); + return true; + } + + curr_size+= chunk->size; + } + while (++chunk < chunks + n_chunks); + + ut_ad(is_initialised()); + mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); + + UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target= 0; + UT_LIST_INIT(flush_list, &buf_page_t::list); + UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); + + for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) + UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); + ulint s= curr_size; + old_size= s; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= srv_buf_pool_size; + + n_chunks_new= n_chunks; + + page_hash.create(2 * curr_size); + zip_hash.create(2 * curr_size); + last_printout_time= time(NULL); + + mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, + MY_MUTEX_INIT_FAST); + + pthread_cond_init(&done_flush_LRU, nullptr); + pthread_cond_init(&done_flush_list, nullptr); + pthread_cond_init(&do_flush_list, nullptr); + pthread_cond_init(&done_free, nullptr); + + try_LRU_scan= true; + + ut_d(flush_hp.m_mutex= &flush_list_mutex;); + ut_d(lru_hp.m_mutex= &mutex); + ut_d(lru_scan_itr.m_mutex= &mutex); + + io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * + OS_AIO_N_PENDING_IOS_PER_THREAD); + + /* FIXME: remove some of these variables */ + srv_buf_pool_curr_size= curr_pool_size; + srv_buf_pool_old_size= srv_buf_pool_size; + srv_buf_pool_base_size= srv_buf_pool_size; + + last_activity_count= srv_get_activity_count(); + + chunk_t::map_ref= chunk_t::map_reg; + buf_LRU_old_ratio_update(100 * 3 / 8, false); + btr_search_sys_create(); + ut_ad(is_initialised()); + return false; +} + +/** Clean up after successful create() */ +void buf_pool_t::close() +{ + ut_ad(this == &buf_pool); + if (!is_initialised()) + return; + + mysql_mutex_destroy(&mutex); + mysql_mutex_destroy(&flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; + bpage= prev_bpage) + { + prev_bpage= UT_LIST_GET_PREV(LRU, bpage); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_d(const lsn_t oldest= bpage->oldest_modification();) + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? (oldest == 0 || oldest == 2) + : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2); + + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + buf_page_free_descriptor(bpage); + } + + for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) + { + buf_block_t *block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + buf_block_free_mutexes(block); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + + pthread_cond_destroy(&done_flush_LRU); + pthread_cond_destroy(&done_flush_list); + pthread_cond_destroy(&do_flush_list); + pthread_cond_destroy(&done_free); + + ut_free(chunks); + chunks= nullptr; + page_hash.free(); + zip_hash.free(); + + io_buf.close(); + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= chunk_t::map_ref= nullptr; + aligned_free(const_cast<byte*>(field_ref_zero)); + field_ref_zero= nullptr; +} + +/** Try to reallocate a control block. 
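+The page frame is copied to a block taken from buf_pool.free; the LRU,
+unzip_LRU, buf_pool.page_hash and (for persistent pages) flush_list entries
+are then moved to that block and the old block is freed. If buf_pool.free is
+empty, false is returned; if the page is buffer-fixed or I/O-fixed and cannot
+be relocated, the newly allocated block is simply freed again.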
+@param block control block to reallocate +@return whether the reallocation succeeded */ +inline bool buf_pool_t::realloc(buf_block_t *block) +{ + buf_block_t* new_block; + + mysql_mutex_assert_owner(&mutex); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + + new_block = buf_LRU_get_free_only(); + + if (new_block == NULL) { + return(false); /* free list was not enough */ + } + + const page_id_t id(block->page.id()); + page_hash_latch* hash_lock = hash_lock_get(id); + hash_lock->write_lock(); + + if (block->page.can_relocate()) { + memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>( + new_block->frame, block->frame, srv_page_size); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + new (&new_block->page) buf_page_t(block->page); + + /* relocate LRU list */ + if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { + UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); + } else { + UT_LIST_ADD_FIRST(LRU, &new_block->page); + } + + if (LRU_old == &block->page) { + LRU_old = &new_block->page; + } + + ut_ad(new_block->page.in_LRU_list); + + /* relocate unzip_LRU list */ + if (block->page.zip.data != NULL) { + ut_ad(block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = true); + + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + UT_LIST_REMOVE(unzip_LRU, block); + + ut_d(block->in_unzip_LRU_list = false); + block->page.zip.data = NULL; + page_zip_set_size(&block->page.zip, 0); + + if (prev_block != NULL) { + UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, new_block); + } + } else { + ut_ad(!block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = false); + } + + /* relocate page_hash */ + ut_ad(block->page.in_page_hash); + ut_ad(new_block->page.in_page_hash); + const ulint fold = id.fold(); + ut_ad(&block->page == page_hash_get_low(id, fold)); + ut_d(block->page.in_page_hash = false); + HASH_REPLACE(buf_page_t, hash, &page_hash, fold, + &block->page, &new_block->page); + + buf_block_modify_clock_inc(block); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(block->frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(block->frame, srv_page_size); + block->page.set_state(BUF_BLOCK_REMOVE_HASH); + if (!fsp_is_system_temporary(id.space())) { + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + block->page.set_corrupt_id(); + + /* set other flags of buf_block_t */ + +#ifdef BTR_CUR_HASH_ADAPT + /* This code should only be executed by resize(), + while the adaptive hash index is disabled. */ + assert_block_ahi_empty(block); + assert_block_ahi_empty_on_init(new_block); + ut_ad(!block->index); + new_block->index = NULL; + new_block->n_hash_helps = 0; + new_block->n_fields = 1; + new_block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->page.set_state(BUF_BLOCK_MEMORY)); + /* free block */ + new_block = block; + } + + hash_lock->write_unlock(); + buf_LRU_block_free_non_file_page(new_block); + return(true); /* free_list was enough */ +} + +/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). +@param[in] fmt format +@param[in] ... 
extra parameters according to fmt */ +static +void +buf_resize_status( + const char* fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_resize_status, + sizeof(export_vars.innodb_buffer_pool_resize_status), + fmt, ap); + + va_end(ap); + + ib::info() << export_vars.innodb_buffer_pool_resize_status; +} + +/** Withdraw blocks from the buffer pool until meeting withdraw_target. +@return whether retry is needed */ +inline bool buf_pool_t::withdraw_blocks() +{ + buf_block_t* block; + ulint loop_count = 0; + + ib::info() << "start to withdraw the last " + << withdraw_target << " blocks"; + + /* Minimize zip_free[i] lists */ + mysql_mutex_lock(&mutex); + buf_buddy_condense_free(); + mysql_mutex_unlock(&mutex); + + while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + + /* try to withdraw from free_list */ + ulint count1 = 0; + + mysql_mutex_lock(&mutex); + block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_FIRST(free)); + while (block != NULL + && UT_LIST_GET_LEN(withdraw) < withdraw_target) { + ut_ad(block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_a(!block->page.in_file()); + + buf_block_t* next_block; + next_block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_NEXT( + list, &block->page)); + + if (will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_REMOVE(free, &block->page); + UT_LIST_ADD_LAST(withdraw, &block->page); + ut_d(block->in_withdraw_list = true); + count1++; + } + + block = next_block; + } + mysql_mutex_unlock(&mutex); + + /* reserve free_list length */ + if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + ulint n_flushed = buf_flush_LRU( + std::max<ulint>(withdraw_target + - UT_LIST_GET_LEN(withdraw), + srv_LRU_scan_depth)); + buf_flush_wait_batch_end_acquiring_mutex(true); + + if (n_flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + n_flushed); + } + } + + /* relocate blocks/buddies in withdrawn area */ + ulint count2 = 0; + + mysql_mutex_lock(&mutex); + buf_page_t* bpage; + bpage = UT_LIST_GET_FIRST(LRU); + while (bpage != NULL) { + buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage); + if (bpage->zip.data != NULL + && will_be_withdrawn(bpage->zip.data) + && bpage->can_relocate()) { + buf_pool_mutex_exit_forbid(); + if (!buf_buddy_realloc( + bpage->zip.data, + page_zip_get_size(&bpage->zip))) { + /* failed to allocate block */ + buf_pool_mutex_exit_allow(); + break; + } + buf_pool_mutex_exit_allow(); + count2++; + } + + if (bpage->state() == BUF_BLOCK_FILE_PAGE + && will_be_withdrawn(*bpage)) { + if (bpage->can_relocate()) { + buf_pool_mutex_exit_forbid(); + if (!realloc( + reinterpret_cast<buf_block_t*>( + bpage))) { + /* failed to allocate block */ + buf_pool_mutex_exit_allow(); + break; + } + buf_pool_mutex_exit_allow(); + count2++; + } + /* NOTE: if the page is in use, + not relocated yet */ + } + + bpage = next_bpage; + } + mysql_mutex_unlock(&mutex); + + buf_resize_status( + "withdrawing blocks. (" ULINTPF "/" ULINTPF ")", + UT_LIST_GET_LEN(withdraw), + withdraw_target); + + ib::info() << "withdrew " + << count1 << " blocks from free list." + << " Tried to relocate " << count2 << " pages (" + << UT_LIST_GET_LEN(withdraw) << "/" + << withdraw_target << ")"; + + if (++loop_count >= 10) { + /* give up for now. + retried after user threads paused. 
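+ withdraw_blocks() returns true below, and the caller
+ buf_pool_t::resize() will sleep for retry_interval seconds
+ before invoking it again.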
*/ + + ib::info() << "will retry to withdraw later"; + + /* need retry later */ + return(true); + } + } + + /* confirm withdrawn enough */ + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { + block = chunk->blocks; + for (ulint j = chunk->size; j--; block++) { + ut_a(block->page.state() == BUF_BLOCK_NOT_USED); + ut_ad(block->in_withdraw_list); + } + } + + ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw) + << " blocks"; + + return(false); +} + + + +inline void buf_pool_t::page_hash_table::write_lock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast<page_hash_latch&>(array[n]).write_lock(); + if (!n) + break; + } +} + + +inline void buf_pool_t::page_hash_table::write_unlock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast<page_hash_latch&>(array[n]).write_unlock(); + if (!n) + break; + } +} + + +namespace +{ + +struct find_interesting_trx +{ + void operator()(const trx_t &trx) + { + if (trx.state == TRX_STATE_NOT_STARTED) + return; + if (trx.mysql_thd == nullptr) + return; + if (withdraw_started <= trx.start_time) + return; + + if (!found) + { + ib::warn() << "The following trx might hold " + "the blocks in buffer pool to " + "be withdrawn. Buffer pool " + "resizing can complete only " + "after all the transactions " + "below release the blocks."; + found= true; + } + + lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); + } + + bool &found; + time_t withdraw_started; + time_t current_time; +}; + +} // namespace + +/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ +inline void buf_pool_t::resize() +{ + ut_ad(this == &buf_pool); + + bool warning = false; + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + ut_ad(!resize_in_progress()); + ut_ad(srv_buf_pool_chunk_unit > 0); + + ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; + + buf_resize_status("Resizing buffer pool from " ULINTPF " to " + ULINTPF " (unit=" ULINTPF ").", + srv_buf_pool_old_size, srv_buf_pool_size, + srv_buf_pool_chunk_unit); + + mysql_mutex_lock(&mutex); + ut_ad(curr_size == old_size); + ut_ad(n_chunks_new == n_chunks); + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + + n_chunks_new = (new_instance_size << srv_page_size_shift) + / srv_buf_pool_chunk_unit; + curr_size = n_chunks_new * chunks->size; + mysql_mutex_unlock(&mutex); + +#ifdef BTR_CUR_HASH_ADAPT + /* disable AHI if needed */ + const bool btr_search_disabled = btr_search_enabled; + + buf_resize_status("Disabling adaptive hash index."); + + btr_search_s_lock_all(); + if (btr_search_disabled) { + btr_search_s_unlock_all(); + } else { + btr_search_s_unlock_all(); + } + + btr_search_disable(); + + if (btr_search_disabled) { + ib::info() << "disabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (curr_size < old_size) { + /* set withdraw target */ + size_t w = 0; + + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; + chunk != echunk; chunk++) + w += chunk->size; + + ut_ad(withdraw_target == 0); + withdraw_target = w; + } + + buf_resize_status("Withdrawing blocks to be shrunken."); + + time_t withdraw_started = time(NULL); + double message_interval = 60; + ulint retry_interval = 1; + +withdraw_retry: + /* wait for the number of blocks fit to the new size (if needed)*/ + bool should_retry_withdraw = curr_size < old_size + && withdraw_blocks(); + + if (srv_shutdown_state != 
SRV_SHUTDOWN_NONE) { + /* abort to resize for shutdown. */ + return; + } + + /* abort buffer pool load */ + buf_load_abort(); + + const time_t current_time = time(NULL); + + if (should_retry_withdraw + && difftime(current_time, withdraw_started) >= message_interval) { + + if (message_interval > 900) { + message_interval = 1800; + } else { + message_interval *= 2; + } + + lock_mutex_enter(); + bool found = false; + trx_sys.trx_list.for_each(find_interesting_trx{ + found, withdraw_started, current_time}); + lock_mutex_exit(); + + withdraw_started = current_time; + } + + if (should_retry_withdraw) { + ib::info() << "Will retry to withdraw " << retry_interval + << " seconds later."; + os_thread_sleep(retry_interval * 1000000); + + if (retry_interval > 5) { + retry_interval = 10; + } else { + retry_interval *= 2; + } + + goto withdraw_retry; + } + + buf_resize_status("Latching whole of buffer pool."); + +#ifndef DBUG_OFF + { + bool should_wait = true; + + while (should_wait) { + should_wait = false; + DBUG_EXECUTE_IF( + "ib_buf_pool_resize_wait_before_resize", + should_wait = true; os_thread_sleep(10000);); + } + } +#endif /* !DBUG_OFF */ + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } + + /* Indicate critical path */ + resizing.store(true, std::memory_order_relaxed); + + mysql_mutex_lock(&mutex); + page_hash.write_lock_all(); + + chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + + /* add/delete chunks */ + + buf_resize_status("buffer pool resizing with chunks " + ULINTPF " to " ULINTPF ".", + n_chunks, n_chunks_new); + + if (n_chunks_new < n_chunks) { + /* delete chunks */ + chunk_t* chunk = chunks + n_chunks_new; + const chunk_t* const echunk = chunks + n_chunks; + + ulint sum_freed = 0; + + while (chunk < echunk) { + /* buf_LRU_block_free_non_file_page() invokes + MEM_NOACCESS() on any buf_pool.free blocks. + We must cancel the effect of that. In + MemorySanitizer, MEM_NOACCESS() is no-op, so + we must not do anything special for it here. 
*/ +#ifdef HAVE_valgrind +# if !__has_feature(memory_sanitizer) + MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size()); +# endif +#else + MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size); +#endif + + buf_block_t* block = chunk->blocks; + + for (ulint j = chunk->size; j--; block++) { + buf_block_free_mutexes(block); + } + + allocator.deallocate_large_dodump( + chunk->mem, &chunk->mem_pfx); + sum_freed += chunk->size; + ++chunk; + } + + /* discard withdraw list */ + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target = 0; + + ib::info() << n_chunks - n_chunks_new + << " chunks (" << sum_freed + << " blocks) were freed."; + + n_chunks = n_chunks_new; + } + + { + /* reallocate chunks */ + const size_t new_chunks_size + = n_chunks_new * sizeof(chunk_t); + + chunk_t* new_chunks = static_cast<chunk_t*>( + ut_zalloc_nokey_nofatal(new_chunks_size)); + + DBUG_EXECUTE_IF("buf_pool_resize_chunk_null", + ut_free(new_chunks); new_chunks= nullptr; ); + + if (!new_chunks) { + ib::error() << "failed to allocate" + " the chunk array."; + n_chunks_new = n_chunks; + warning = true; + chunks_old = NULL; + goto calc_buf_pool_size; + } + + ulint n_chunks_copy = ut_min(n_chunks_new, + n_chunks); + + memcpy(new_chunks, chunks, + n_chunks_copy * sizeof *new_chunks); + + for (ulint j = 0; j < n_chunks_copy; j++) { + new_chunks[j].reg(); + } + + chunks_old = chunks; + chunks = new_chunks; + } + + if (n_chunks_new > n_chunks) { + /* add chunks */ + ulint sum_added = 0; + ulint n = n_chunks; + const size_t unit = srv_buf_pool_chunk_unit; + + for (chunk_t* chunk = chunks + n_chunks, + * const echunk = chunks + n_chunks_new; + chunk != echunk; chunk++) { + if (!chunk->create(unit)) { + ib::error() << "failed to allocate" + " memory for buffer pool chunk"; + + warning = true; + n_chunks_new = n_chunks; + break; + } + + sum_added += chunk->size; + ++n; + } + + ib::info() << n_chunks_new - n_chunks + << " chunks (" << sum_added + << " blocks) were added."; + + n_chunks = n; + } +calc_buf_pool_size: + /* recalc curr_size */ + ulint new_size = 0; + + { + chunk_t* chunk = chunks; + const chunk_t* const echunk = chunk + n_chunks; + do { + new_size += chunk->size; + } while (++chunk != echunk); + } + + curr_size = new_size; + n_chunks_new = n_chunks; + + if (chunks_old) { + ut_free(chunks_old); + chunks_old = NULL; + } + + chunk_t::map* chunk_map_old = chunk_t::map_ref; + chunk_t::map_ref = chunk_t::map_reg; + + /* set size */ + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + ulint s= curr_size; + old_size= s; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= n_chunks * srv_buf_pool_chunk_unit; + srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/ + innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size)); + + const bool new_size_too_diff + = srv_buf_pool_base_size > srv_buf_pool_size * 2 + || srv_buf_pool_base_size * 2 < srv_buf_pool_size; + + mysql_mutex_unlock(&mutex); + page_hash.write_unlock_all(); + + UT_DELETE(chunk_map_old); + + resizing.store(false, std::memory_order_relaxed); + + /* Normalize other components, if the new size is too different */ + if (!warning && new_size_too_diff) { + srv_buf_pool_base_size = srv_buf_pool_size; + + buf_resize_status("Resizing also other hash tables."); + + srv_lock_table_size = 5 + * (srv_buf_pool_size >> srv_page_size_shift); + lock_sys.resize(srv_lock_table_size); + dict_sys.resize(); + + ib::info() << "Resized hash tables at lock_sys," +#ifdef BTR_CUR_HASH_ADAPT + " adaptive hash index," +#endif /* BTR_CUR_HASH_ADAPT */ + " dictionary."; + } + + /* normalize ibuf.max_size */ + ibuf_max_size_update(srv_change_buffer_max_size); + + if (srv_buf_pool_old_size != srv_buf_pool_size) { + + ib::info() << "Completed to resize buffer pool from " + << srv_buf_pool_old_size + << " to " << srv_buf_pool_size << "."; + srv_buf_pool_old_size = srv_buf_pool_size; + } + +#ifdef BTR_CUR_HASH_ADAPT + /* enable AHI if needed */ + if (btr_search_disabled) { + btr_search_enable(true); + ib::info() << "Re-enabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + char now[32]; + + ut_sprintf_timestamp(now); + if (!warning) { + buf_resize_status("Completed resizing buffer pool at %s.", + now); + } else { + buf_resize_status("Resizing buffer pool failed," + " finished resizing at %s.", now); + } + + ut_d(validate()); + + return; +} + +/** Thread pool task invoked by innodb_buffer_pool_size changes. */ +static void buf_resize_callback(void *) +{ + DBUG_ENTER("buf_resize_callback"); + ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); + mysql_mutex_lock(&buf_pool.mutex); + const auto size= srv_buf_pool_size; + const bool work= srv_buf_pool_old_size != size; + mysql_mutex_unlock(&buf_pool.mutex); + + if (work) + buf_pool.resize(); + else + { + std::ostringstream sout; + sout << "Size did not change: old size = new size = " << size; + buf_resize_status(sout.str().c_str()); + } + DBUG_VOID_RETURN; +} + +/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */ +static tpool::task_group single_threaded_group(1); +static tpool::waitable_task buf_resize_task(buf_resize_callback, + nullptr, &single_threaded_group); + +void buf_resize_start() +{ + srv_thread_pool->submit_task(&buf_resize_task); +} + +void buf_resize_shutdown() +{ + buf_resize_task.wait(); +} + + +/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and +buf_pool.page_hash. +The caller must relocate bpage->list. 
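+The descriptor dpage is initialised as a copy of *bpage; the LRU list
+position (including buf_pool.LRU_old) and the buf_pool.page_hash entry are
+then switched from bpage to dpage.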
+@param bpage BUF_BLOCK_ZIP_PAGE block +@param dpage destination control block */ +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +{ + const ulint fold= bpage->id().fold(); + ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked()); + ut_a(bpage->io_fix() == BUF_IO_NONE); + ut_a(!bpage->buf_fix_count()); + ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); + ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE); + + new (dpage) buf_page_t(*bpage); + + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + if (buf_page_t *b= buf_pool.LRU_remove(bpage)) + UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage); + else + UT_LIST_ADD_FIRST(buf_pool.LRU, dpage); + + if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage)) + { + buf_pool.LRU_old= dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) || + !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) || + UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); + } + else + { + /* Check that the "old" flag is consistent in + the block and its neighbours. */ + dpage->set_old(dpage->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(CheckInLRUList::validate()); + + /* relocate buf_pool.page_hash */ + ut_ad(bpage->in_page_hash); + ut_ad(dpage->in_page_hash); + ut_d(bpage->in_page_hash= false); + HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage); +} + +/** Register a watch for a page identifier. The caller must hold an +exclusive page hash latch. The *hash_lock may be released, +relocated, and reacquired. +@param id page identifier +@param hash_lock exclusively held page_hash latch +@return a buffer pool block corresponding to id +@retval nullptr if the block was not present, and a watch was installed */ +inline buf_page_t *buf_pool_t::watch_set(const page_id_t id, + page_hash_latch **hash_lock) +{ + const ulint fold= id.fold(); + ut_ad(*hash_lock == page_hash.lock_get(fold)); + ut_ad((*hash_lock)->is_write_locked()); + +retry: + if (buf_page_t *bpage= page_hash_get_low(id, fold)) + { + if (!watch_is_sentinel(*bpage)) + /* The page was loaded meanwhile. */ + return bpage; + /* Add to an existing watch. */ + bpage->fix(); + return nullptr; + } + + (*hash_lock)->write_unlock(); + /* Allocate a watch[] and then try to insert it into the page_hash. */ + mysql_mutex_lock(&mutex); + + /* The maximum number of purge tasks should never exceed + the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a + watch when setting another watch. */ + for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; ) + { + ut_ad(w->access_time == 0); + ut_ad(!w->oldest_modification()); + ut_ad(!w->zip.data); + ut_ad(!w->in_zip_hash); + if (w->state() == BUF_BLOCK_ZIP_PAGE) + /* This watch may be in use for some other page. */ + continue; + ut_ad(w->state() == BUF_BLOCK_NOT_USED); + ut_ad(!w->buf_fix_count()); + /* w is pointing to watch[], which is protected by mutex. + Normally, buf_page_t::id for objects that are reachable by + page_hash_get_low(id, fold) are protected by hash_lock. 
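+ Here, however, w is not yet reachable via page_hash_get_low(): it is
+ inserted into page_hash only further below, after the hash_lock has been
+ re-acquired, so its state and id can be assigned while holding only
+ buf_pool.mutex.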
*/ + w->set_state(BUF_BLOCK_ZIP_PAGE); + w->id_= id; + + *hash_lock= page_hash.lock_get(fold); + (*hash_lock)->write_lock(); + mysql_mutex_unlock(&mutex); + + buf_page_t *bpage= page_hash_get_low(id, fold); + if (UNIV_LIKELY_NULL(bpage)) + { + (*hash_lock)->write_unlock(); + mysql_mutex_lock(&mutex); + w->set_state(BUF_BLOCK_NOT_USED); + *hash_lock= page_hash.lock_get(fold); + (*hash_lock)->write_lock(); + mysql_mutex_unlock(&mutex); + goto retry; + } + + ut_ad(!w->buf_fix_count_); + w->buf_fix_count_= 1; + ut_ad(!w->in_page_hash); + ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */ + HASH_INSERT(buf_page_t, hash, &page_hash, fold, w); + return nullptr; + } + + ut_error; + mysql_mutex_unlock(&mutex); + return nullptr; +} + +/** Mark the page status as FREED for the given tablespace id and +page number. If the page is not in buffer pool then ignore it. +@param[in,out] space tablespace +@param[in] page page number +@param[in,out] mtr mini-transaction +@param[in] file file name +@param[in] line line where called */ +void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr, + const char *file, unsigned line) +{ + ut_ad(mtr); + ut_ad(mtr->is_active()); + + if (srv_immediate_scrub_data_uncompressed +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + || space->is_compressed() +#endif + ) + mtr->add_freed_offset(space, page); + + buf_pool.stat.n_page_gets++; + const page_id_t page_id(space->id, page); + const ulint fold= page_id.fold(); + page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold); + if (buf_block_t *block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash_get_low(page_id, fold))) + { + if (block->page.state() != BUF_BLOCK_FILE_PAGE) + /* FIXME: convert, but avoid buf_zip_decompress() */; + else + { + buf_block_buf_fix_inc(block, file, line); + ut_ad(block->page.buf_fix_count()); + hash_lock->read_unlock(); + + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + rw_lock_x_lock_inline(&block->lock, 0, file, line); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + block->page.status= buf_page_t::FREED; + return; + } + } + + hash_lock->read_unlock(); +} + +/** Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size +@return pointer to the block */ +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) +{ + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + buf_pool.stat.n_page_gets++; + + bool discard_attempted= false; + const ulint fold= page_id.fold(); + buf_page_t *bpage; + page_hash_latch *hash_lock; + + for (;;) + { +lookup: + bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock); + if (bpage) + break; + + dberr_t err= buf_read_page(page_id, zip_size); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { + ib::error() << "Reading compressed page " << page_id + << " failed with error: " << err; + goto err_exit; + } + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + } + + ut_ad(hash_lock->is_read_locked()); + + if (!bpage->zip.data) + { + /* There is no compressed page. 
*/ +err_exit: + hash_lock->read_unlock(); + return nullptr; + } + + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); + + switch (bpage->state()) { + case BUF_BLOCK_ZIP_PAGE: + bpage->fix(); + goto got_block; + case BUF_BLOCK_FILE_PAGE: + /* Discard the uncompressed page frame if possible. */ + if (!discard_attempted) + { + discard_attempted= true; + hash_lock->read_unlock(); + mysql_mutex_lock(&buf_pool.mutex); + if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold)) + buf_LRU_free_page(bpage, false); + mysql_mutex_unlock(&buf_pool.mutex); + goto lookup; + } + + buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage), + __FILE__, __LINE__); + goto got_block; + default: + break; + } + + ut_error; + goto err_exit; + +got_block: + bool must_read= bpage->io_fix() == BUF_IO_READ; + hash_lock->read_unlock(); + + DBUG_ASSERT(bpage->status != buf_page_t::FREED); + + bpage->set_accessed(); + buf_page_make_young_if_needed(bpage); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(bpage->buf_fix_count()); + ut_ad(bpage->in_file()); + + if (must_read) + /* Let us wait until the read operation completes */ + while (bpage->io_fix() == BUF_IO_READ) + os_thread_sleep(WAIT_FOR_READ); + + return bpage; +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /*!< in: block to init */ +{ +#ifdef BTR_CUR_HASH_ADAPT + /* No adaptive hash index entries may point to a previously + unused (and now freshly allocated) block. */ + assert_block_ahi_empty_on_init(block); + block->index = NULL; + + block->n_hash_helps = 0; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check) /*!< in: TRUE=verify the page checksum */ +{ + const byte* frame = block->page.zip.data; + ulint size = page_zip_get_size(&block->page.zip); + /* The tablespace will not be found if this function is called + during IMPORT. */ + fil_space_t* space= fil_space_t::get(block->page.id().space()); + const unsigned key_version = mach_read_from_4( + frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL; + const bool encrypted = crypt_data + && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED + && (!crypt_data->is_default_encryption() + || srv_encrypt_tables); + + ut_ad(block->zip_size()); + ut_a(block->page.id().space() != 0); + + if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { + + ib::error() << "Compressed page checksum mismatch for " + << (space ? 
space->chain.start->name : "") + << block->page.id() << ": stored: " + << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM) + << ", crc32: " + << page_zip_calc_checksum( + frame, size, SRV_CHECKSUM_ALGORITHM_CRC32) + << " innodb: " + << page_zip_calc_checksum( + frame, size, SRV_CHECKSUM_ALGORITHM_INNODB) + << ", none: " + << page_zip_calc_checksum( + frame, size, SRV_CHECKSUM_ALGORITHM_NONE) + << " (algorithm: " << srv_checksum_algorithm << ")"; + goto err_exit; + } + + switch (fil_page_get_type(frame)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + if (page_zip_decompress(&block->page.zip, + block->frame, TRUE)) { + if (space) { + space->release(); + } + return(TRUE); + } + + ib::error() << "Unable to decompress " + << (space ? space->chain.start->name : "") + << block->page.id(); + goto err_exit; + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + /* Copy to uncompressed storage. */ + memcpy(block->frame, frame, block->zip_size()); + if (space) { + space->release(); + } + + return(TRUE); + } + + ib::error() << "Unknown compressed page type " + << fil_page_get_type(frame) + << " in " << (space ? space->chain.start->name : "") + << block->page.id(); + +err_exit: + if (encrypted) { + ib::info() << "Row compressed page could be encrypted" + " with key_version " << key_version; + } + + if (space) { + if (encrypted) { + dict_set_encrypted_by_space(space); + } else { + dict_set_corrupted_by_space(space); + } + + space->release(); + } + + return(FALSE); +} + +/** Wait for the block to be read in. +@param[in] block The block to check */ +static +void +buf_wait_for_read( + buf_block_t* block) +{ + /* Note: + + We are using the block->lock to check for IO state. + We set the IO_READ state under the protection of the hash_lock. + This is safe because another thread can only + access the block (and check for IO state) after the block has been + added to the page hashtable. */ + + while (block->page.io_fix() == BUF_IO_READ) { + rw_lock_s_lock(&block->lock); + rw_lock_s_unlock(&block->lock); + } +} + +#ifdef BTR_CUR_HASH_ADAPT +/** If a stale adaptive hash index exists on the block, drop it. +Multiple executions of btr_search_drop_page_hash_index() on the +same block must be prevented by exclusive page latch. */ +ATTRIBUTE_COLD +static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type) +{ + switch (fix_type) { + case MTR_MEMO_BUF_FIX: + /* We do not drop the adaptive hash index, because safely doing + so would require acquiring block->lock, and that is not safe + to acquire in some RW_NO_LATCH access paths. Those code paths + should have no business accessing the adaptive hash index anyway. */ + break; + case MTR_MEMO_PAGE_S_FIX: + /* Temporarily release our S-latch. 
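+ The block is X-latched while any stale adaptive hash index entries
+ are dropped, and the S-latch is re-acquired before returning.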
*/ + rw_lock_s_unlock(&block->lock); + rw_lock_x_lock(&block->lock); + if (dict_index_t *index= block->index) + if (index->freed()) + btr_search_drop_page_hash_index(block); + rw_lock_x_unlock(&block->lock); + rw_lock_s_lock(&block->lock); + break; + case MTR_MEMO_PAGE_SX_FIX: + rw_lock_sx_unlock(&block->lock); + rw_lock_x_lock(&block->lock); + if (dict_index_t *index= block->index) + if (index->freed()) + btr_search_drop_page_hash_index(block); + rw_lock_x_unlock(&block->lock); + rw_lock_sx_lock(&block->lock); + break; + default: + ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX); + btr_search_drop_page_hash_index(block); + } +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/** Lock the page with the given latch type. +@param[in,out] block block to be locked +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] mtr mini-transaction +@param[in] file file name +@param[in] line line where called +@return pointer to locked block */ +static buf_block_t* buf_page_mtr_lock(buf_block_t *block, + ulint rw_latch, + mtr_t* mtr, + const char *file, + unsigned line) +{ + mtr_memo_type_t fix_type; + switch (rw_latch) + { + case RW_NO_LATCH: + fix_type= MTR_MEMO_BUF_FIX; + goto done; + case RW_S_LATCH: + rw_lock_s_lock_inline(&block->lock, 0, file, line); + fix_type= MTR_MEMO_PAGE_S_FIX; + break; + case RW_SX_LATCH: + rw_lock_sx_lock_inline(&block->lock, 0, file, line); + fix_type= MTR_MEMO_PAGE_SX_FIX; + break; + default: + ut_ad(rw_latch == RW_X_LATCH); + rw_lock_x_lock_inline(&block->lock, 0, file, line); + fix_type= MTR_MEMO_PAGE_X_FIX; + break; + } + +#ifdef BTR_CUR_HASH_ADAPT + { + dict_index_t *index= block->index; + if (index && index->freed()) + buf_defer_drop_ahi(block, fix_type); + } +#endif /* BTR_CUR_HASH_ADAPT */ + +done: + mtr_memo_push(mtr, block, fix_type); + return block; +} + +/** Low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_low( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + const char* file, + unsigned line, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge) +{ + buf_block_t* block; + unsigned access_time; + ulint retries = 0; + const ulint fold = page_id.fold(); + + ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL)); + ut_ad(!mtr || mtr->is_active()); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_SX_LATCH) + || (rw_latch == RW_NO_LATCH)); + ut_ad(!allow_ibuf_merge + || mode == BUF_GET + || mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_GET_IF_IN_POOL + || mode == BUF_GET_IF_IN_POOL_OR_WATCH); + + if (err) { + *err = DB_SUCCESS; + } + +#ifdef UNIV_DEBUG + switch (mode) { + case BUF_EVICT_IF_IN_POOL: + /* After DISCARD TABLESPACE, the tablespace would not exist, + but in IMPORT TABLESPACE, PageConverter::operator() must + replace any old pages, which were not evicted during DISCARD. 
+ Skip the assertion on space_page_size. */ + break; + case BUF_PEEK_IF_IN_POOL: + case BUF_GET_IF_IN_POOL: + /* The caller may pass a dummy page size, + because it does not really matter. */ + break; + default: + ut_error; + case BUF_GET_POSSIBLY_FREED: + break; + case BUF_GET_NO_LATCH: + ut_ad(rw_latch == RW_NO_LATCH); + /* fall through */ + case BUF_GET: + case BUF_GET_IF_IN_POOL_OR_WATCH: + fil_space_t* s = fil_space_get(page_id.space()); + ut_ad(s); + ut_ad(s->zip_size() == zip_size); + } +#endif /* UNIV_DEBUG */ + + ut_ad(!mtr || !ibuf_inside(mtr) + || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL)); + + buf_pool.stat.n_page_gets++; +loop: + buf_block_t* fix_block; + block = guess; + + page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold); + + if (block) { + + /* If the guess is a compressed page descriptor that + has been allocated by buf_page_alloc_descriptor(), + it may have been freed by buf_relocate(). */ + + if (!buf_pool.is_uncompressed(block) + || page_id != block->page.id() + || block->page.state() != BUF_BLOCK_FILE_PAGE) { + /* Our guess was bogus or things have changed + since. */ + guess = nullptr; + goto lookup; + } else { + ut_ad(!block->page.in_zip_hash); + } + } else { +lookup: + block = reinterpret_cast<buf_block_t*>( + buf_pool.page_hash_get_low(page_id, fold)); + } + + if (!block || buf_pool.watch_is_sentinel(block->page)) { + hash_lock->read_unlock(); + block = nullptr; + } + + if (UNIV_UNLIKELY(!block)) { + /* Page not in buf_pool: needs to be read from file */ + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + hash_lock = buf_pool.page_hash.lock<true>(fold); + + if (buf_page_t *bpage= buf_pool.watch_set( + page_id, &hash_lock)) { + /* We can release hash_lock after we + increment the fix count to make + sure that no state change takes place. */ + bpage->fix(); + hash_lock->write_unlock(); + block = reinterpret_cast<buf_block_t*>(bpage); + fix_block = block; + goto got_block; + } + + hash_lock->write_unlock(); + } + + switch (mode) { + case BUF_GET_IF_IN_POOL: + case BUF_GET_IF_IN_POOL_OR_WATCH: + case BUF_PEEK_IF_IN_POOL: + case BUF_EVICT_IF_IN_POOL: + return(NULL); + } + + /* The call path is buf_read_page() -> + buf_read_page_low() (fil_space_t::io()) -> + buf_page_read_complete() -> + buf_decrypt_after_read(). Here fil_space_t* is used + and we decrypt -> buf_page_check_corrupt() where page + checksums are compared. Decryption, decompression as + well as error handling takes place at a lower level. + Here we only need to know whether the page really is + corrupted, or if an encrypted page with a valid + checksum cannot be decypted. */ + + dberr_t local_err = buf_read_page(page_id, zip_size); + + if (local_err == DB_SUCCESS) { + buf_read_ahead_random(page_id, zip_size, + ibuf_inside(mtr)); + + retries = 0; + } else if (mode == BUF_GET_POSSIBLY_FREED) { + if (err) { + *err = local_err; + } + return NULL; + } else if (retries < BUF_PAGE_READ_MAX_RETRIES) { + ++retries; + + DBUG_EXECUTE_IF( + "innodb_page_corruption_retries", + retries = BUF_PAGE_READ_MAX_RETRIES; + ); + } else { + if (err) { + *err = local_err; + } + + /* Pages whose encryption key is unavailable or used + key, encryption algorithm or encryption method is + incorrect are marked as encrypted in + buf_page_check_corrupt(). Unencrypted page could be + corrupted in a way where the key_id field is + nonzero. There is no checksum on field + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. 
*/ + if (local_err == DB_DECRYPTION_FAILED) { + return (NULL); + } + + if (local_err == DB_PAGE_CORRUPTED + && srv_force_recovery) { + return NULL; + } + + /* Try to set table as corrupted instead of + asserting. */ + if (page_id.space() == TRX_SYS_SPACE) { + } else if (page_id.space() == SRV_TMP_SPACE_ID) { + } else if (fil_space_t* space= fil_space_t::get( + page_id.space())) { + bool set = dict_set_corrupted_by_space(space); + space->release(); + if (set) { + return NULL; + } + } + + ib::fatal() << "Unable to read page " << page_id + << " into the buffer pool after " + << BUF_PAGE_READ_MAX_RETRIES + << ". The most probable cause" + " of this error may be that the" + " table has been corrupted." + " See https://mariadb.com/kb/en/library/innodb-recovery-modes/"; + } + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + goto loop; + } else { + fix_block = block; + } + + fix_block->fix(); + hash_lock->read_unlock(); + +got_block: + switch (mode) { + default: + ut_ad(block->zip_size() == zip_size); + break; + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + case BUF_EVICT_IF_IN_POOL: + if (fix_block->page.io_fix() == BUF_IO_READ) { + /* The page is being read to buffer pool, + but we cannot wait around for the read to + complete. */ + fix_block->unfix(); + return(NULL); + } + } + + switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) { + case BUF_BLOCK_FILE_PAGE: + if (fsp_is_system_temporary(page_id.space()) + && block->page.io_fix() != BUF_IO_NONE) { + /* This suggests that the page is being flushed. + Avoid returning reference to this page. + Instead wait for the flush action to complete. */ + fix_block->unfix(); + os_thread_sleep(WAIT_FOR_WRITE); + goto loop; + } + + if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) { +evict_from_pool: + ut_ad(!fix_block->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + fix_block->unfix(); + + if (!buf_LRU_free_page(&fix_block->page, true)) { + ut_ad(0); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return(NULL); + } + + break; + default: + ut_error; + break; + + case BUF_BLOCK_ZIP_PAGE: + if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) { + goto evict_from_pool; + } + + if (mode == BUF_PEEK_IF_IN_POOL) { + /* This mode is only used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only + page, so do not bother decompressing the page. */ + fix_block->unfix(); + + return(NULL); + } + + buf_page_t* bpage = &block->page; + + /* Note: We have already buffer fixed this block. */ + if (bpage->buf_fix_count() > 1 + || bpage->io_fix() != BUF_IO_NONE) { + + /* This condition often occurs when the buffer + is not buffer-fixed, but I/O-fixed by + buf_page_init_for_read(). */ + fix_block->unfix(); + + /* The block is buffer-fixed or I/O-fixed. + Try again later. */ + os_thread_sleep(WAIT_FOR_READ); + + goto loop; + } + + /* Buffer-fix the block so that it cannot be evicted + or relocated while we are attempting to allocate an + uncompressed page. */ + + block = buf_LRU_get_free_block(false); + buf_block_init_low(block); + + mysql_mutex_lock(&buf_pool.mutex); + hash_lock = buf_pool.page_hash.lock_get(fold); + + hash_lock->write_lock(); + + /* Buffer-fixing prevents the page_hash from changing. 
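+ While the buffer fix was held, bpage could not be evicted or relocated,
+ so the page_hash entry for page_id must still point to it; this is
+ asserted below, and only then is the fix released, with the exclusive
+ hash_lock taking over the protection.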
*/ + ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold)); + + fix_block->unfix(); /* hash_lock protects us after this */ + + if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) { + /* The block was buffer-fixed or I/O-fixed while + buf_pool.mutex was not held by this thread. + Free the block that was allocated and retry. + This should be extremely unlikely, for example, + if buf_page_get_zip() was invoked. */ + + hash_lock->write_unlock(); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&buf_pool.mutex); + + /* Try again */ + goto loop; + } + + fix_block = block; + + /* Move the compressed page from bpage to block, + and uncompress it. */ + + /* Note: this is the uncompressed block and it is not + accessible by other threads yet because it is not in + any list or hash table */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(bpage, &block->page); + + /* Set after buf_relocate(). */ + block->page.set_buf_fix_count(1); + + buf_flush_relocate_on_flush_list(bpage, &block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + /* Buffer-fix, I/O-fix, and X-latch the block + for the duration of the decompression. + Also add the block to the unzip_LRU list. */ + block->page.set_state(BUF_BLOCK_FILE_PAGE); + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(block, FALSE); + + block->page.set_io_fix(BUF_IO_READ); + rw_lock_x_lock_inline(&block->lock, 0, file, line); + + MEM_UNDEFINED(bpage, sizeof *bpage); + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock->write_unlock(); + buf_pool.n_pend_unzip++; + + access_time = block->page.is_accessed(); + + if (!access_time && !recv_no_ibuf_operations + && ibuf_page_exists(block->page.id(), zip_size)) { + block->page.ibuf_exist = true; + } + + buf_page_free_descriptor(bpage); + + /* Decompress the page while not holding + buf_pool.mutex. */ + + if (!buf_zip_decompress(block, false)) { + rw_lock_x_unlock(&fix_block->lock); + fix_block->page.io_unfix(); + fix_block->unfix(); + --buf_pool.n_pend_unzip; + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + return NULL; + } + + rw_lock_x_unlock(&block->lock); + fix_block->page.io_unfix(); + --buf_pool.n_pend_unzip; + break; + } + + ut_ad(block == fix_block); + ut_ad(fix_block->page.buf_fix_count()); + + ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +re_evict: + if (mode != BUF_GET_IF_IN_POOL + && mode != BUF_GET_IF_IN_POOL_OR_WATCH) { + } else if (!ibuf_debug) { + } else if (fil_space_t* space = fil_space_t::get(page_id.space())) { + /* Try to evict the block from the buffer pool, to use the + insert buffer (change buffer) as much as possible. */ + + mysql_mutex_lock(&buf_pool.mutex); + + fix_block->unfix(); + + /* Blocks cannot be relocated or enter or exit the + buf_pool while we are holding the buf_pool.mutex. */ + const bool evicted = buf_LRU_free_page(&fix_block->page, true); + space->release(); + + if (evicted) { + hash_lock = buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + mysql_mutex_unlock(&buf_pool.mutex); + /* We may set the watch, as it would have + been set if the page were not in the + buffer pool in the first place. */ + block= reinterpret_cast<buf_block_t*>( + mode == BUF_GET_IF_IN_POOL_OR_WATCH + ? 
buf_pool.watch_set(page_id, &hash_lock) + : buf_pool.page_hash_get_low(page_id, fold)); + hash_lock->write_unlock(); + + if (block != NULL) { + /* Either the page has been read in or + a watch was set on that in the window + where we released the buf_pool.mutex + and before we acquire the hash_lock + above. Try again. */ + guess = block; + + goto loop; + } + + return(NULL); + } + + fix_block->fix(); + mysql_mutex_unlock(&buf_pool.mutex); + buf_flush_list(); + buf_flush_wait_batch_end_acquiring_mutex(false); + while (buf_flush_list_space(space)); + os_aio_wait_until_no_pending_writes(); + + if (fix_block->page.buf_fix_count() == 1 + && !fix_block->page.oldest_modification()) { + goto re_evict; + } + + /* Failed to evict the page; change it directly */ + } +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + ut_ad(fix_block->page.buf_fix_count()); + +#ifdef UNIV_DEBUG + /* We have already buffer fixed the page, and we are committed to + returning this page to the caller. Register for debugging. + Avoid debug latching if page/block belongs to system temporary + tablespace (Not much needed for table with single threaded access.). */ + if (!fsp_is_system_temporary(page_id.space())) { + ibool ret; + ret = rw_lock_s_lock_nowait( + fix_block->debug_latch, file, line); + ut_a(ret); + } +#endif /* UNIV_DEBUG */ + + /* While tablespace is reinited the indexes are already freed but the + blocks related to it still resides in buffer pool. Trying to remove + such blocks from buffer pool would invoke removal of AHI entries + associated with these blocks. Logic to remove AHI entry will try to + load the block but block is already in free state. Handle the said case + with mode = BUF_PEEK_IF_IN_POOL that is invoked from + "btr_search_drop_page_hash_when_freed". */ + ut_ad(mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL + || fix_block->page.status != buf_page_t::FREED); + + const bool not_first_access = fix_block->page.set_accessed(); + + if (mode != BUF_PEEK_IF_IN_POOL) { + buf_page_make_young_if_needed(&fix_block->page); + } + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE); + + /* We have to wait here because the IO_READ state was set + under the protection of the hash_lock and not block->lock. */ + buf_wait_for_read(fix_block); + + if (fix_block->page.id() != page_id) { + fix_block->unfix(); + +#ifdef UNIV_DEBUG + if (!fsp_is_system_temporary(page_id.space())) { + rw_lock_s_unlock(fix_block->debug_latch); + } +#endif /* UNIV_DEBUG */ + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + + return NULL; + } + + if (fix_block->page.status != buf_page_t::FREED + && allow_ibuf_merge + && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX + && page_is_leaf(fix_block->frame)) { + rw_lock_x_lock_inline(&fix_block->lock, 0, file, line); + + if (fix_block->page.ibuf_exist) { + fix_block->page.ibuf_exist = false; + ibuf_merge_or_delete_for_page(fix_block, page_id, + zip_size); + } + + if (rw_latch == RW_X_LATCH) { + mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX); + } else { + rw_lock_x_unlock(&fix_block->lock); + goto get_latch; + } + } else { +get_latch: + fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr, + file, line); + } + + if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr)); + } + + return(fix_block); +} + +/** Get access to a database page. 
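The retry paths above all hinge on the buffer-fix counter: once a thread has fixed a block under the hash latch, the block can neither be evicted nor relocated until every fix is matched by an unfix, which is why each early-return and goto-loop branch carefully unfixes first. The following standalone sketch (editorial, not InnoDB code; the type and function names are illustrative) reduces that pin/unpin protocol to its core.

// Editorial sketch of the buffer-fix (pin) protocol relied on above.
// page_stub and can_evict() are illustrative names, not InnoDB APIs.
#include <atomic>
#include <cassert>

struct page_stub
{
  std::atomic<unsigned> buf_fix_count{0};

  void fix() { ++buf_fix_count; }

  void unfix()
  {
    unsigned prev= buf_fix_count.fetch_sub(1);
    assert(prev > 0);            // every unfix() must pair with a fix()
    (void) prev;
  }

  // An evictor may free the frame only while no thread holds a fix.
  bool can_evict() const { return buf_fix_count.load() == 0; }
};

int main()
{
  page_stub p;
  p.fix();                       // reader pins the page under the hash latch
  assert(!p.can_evict());        // eviction/relocation is now refused
  p.unfix();                     // reader is done with the frame
  assert(p.can_evict());
}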
Buffered redo log may be applied. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the pages from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + const char* file, + unsigned line, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge) +{ + if (buf_block_t *block= recv_sys.recover(page_id)) + { + block->fix(); + ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line)); + if (err) + *err= DB_SUCCESS; + const bool must_merge= allow_ibuf_merge && + ibuf_page_exists(page_id, block->zip_size()); + if (block->page.status == buf_page_t::FREED) + ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); + else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX && + page_is_leaf(block->frame)) + { + rw_lock_x_lock_inline(&block->lock, 0, file, line); + block->page.ibuf_exist= false; + ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()); + + if (rw_latch == RW_X_LATCH) + { + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + return block; + } + rw_lock_x_unlock(&block->lock); + } + block= buf_page_mtr_lock(block, rw_latch, mtr, file, line); + return block; + } + + return buf_page_get_low(page_id, zip_size, rw_latch, + guess, mode, file, line, mtr, err, allow_ibuf_merge); +} + +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. 
+@return TRUE if success */ +ibool +buf_page_optimistic_get( +/*====================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: guessed buffer block */ + ib_uint64_t modify_clock,/*!< in: modify clock value */ + const char* file, /*!< in: file name */ + unsigned line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ibool success; + + ut_ad(block); + ut_ad(mtr); + ut_ad(mtr->is_active()); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE + || block->page.io_fix() != BUF_IO_NONE)) { + return FALSE; + } + + const page_id_t id(block->page.id()); + + page_hash_latch *hash_lock = buf_pool.hash_lock_get(id); + hash_lock->read_lock(); + + if (UNIV_UNLIKELY(id != block->page.id() + || block->page.state() != BUF_BLOCK_FILE_PAGE + || block->page.io_fix() != BUF_IO_NONE)) { + hash_lock->read_unlock(); + return(FALSE); + } + + buf_block_buf_fix_inc(block, file, line); + hash_lock->read_unlock(); + + block->page.set_accessed(); + + buf_page_make_young_if_needed(&block->page); + + ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL)); + + mtr_memo_type_t fix_type; + + if (rw_latch == RW_S_LATCH) { + fix_type = MTR_MEMO_PAGE_S_FIX; + success = rw_lock_s_lock_nowait(&block->lock, file, line); + } else { + fix_type = MTR_MEMO_PAGE_X_FIX; + success = rw_lock_x_lock_func_nowait_inline( + &block->lock, file, line); + } + + ut_ad(id == block->page.id()); + + if (!success) { + buf_block_buf_fix_dec(block); + return(FALSE); + } + + if (modify_clock != block->modify_clock) { + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&block->lock); + } else { + rw_lock_x_unlock(&block->lock); + } + + buf_block_buf_fix_dec(block); + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + + buf_pool.stat.n_page_gets++; + + return(TRUE); +} + +/** Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys_t::mutex. +@param[in] page_id page id +@param[in] file file name +@param[in] line line where called +@param[in] mtr mini-transaction +@return pointer to a page or NULL */ +buf_block_t* +buf_page_try_get_func( + const page_id_t page_id, + const char* file, + unsigned line, + mtr_t* mtr) +{ + ut_ad(mtr); + ut_ad(mtr->is_active()); + + page_hash_latch *hash_lock; + buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id, + page_id.fold(), + &hash_lock); + if (!bpage) + return nullptr; + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + { + hash_lock->read_unlock(); + return nullptr; + } + + buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage); + buf_block_buf_fix_inc(block, file, line); + hash_lock->read_unlock(); + + mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX; + if (!rw_lock_s_lock_nowait(&block->lock, file, line)) + { + /* Let us try to get an X-latch. If the current thread + is holding an X-latch on the page, we cannot get an S-latch. 
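buf_page_optimistic_get() above validates a guessed block by comparing the caller-supplied modify_clock with the block's current value after re-latching it; any structural modification in between bumps the clock and sends the caller back to the pessimistic path. A minimal sketch of that version-counter check, with purely illustrative names (block_stub, optimistic_ok):

// Editorial sketch of the modify_clock validation, not InnoDB code.
#include <cstdint>
#include <iostream>

struct block_stub
{
  uint64_t modify_clock= 0;   // incremented on every structural modification
  int payload= 0;

  void modify(int v) { payload= v; ++modify_clock; }
};

// After re-latching the guessed block, the access succeeds only if the
// clock still matches the value sampled when the guess was taken.
bool optimistic_ok(const block_stub &b, uint64_t sampled_clock)
{
  return sampled_clock == b.modify_clock;
}

int main()
{
  block_stub b;
  const uint64_t sampled= b.modify_clock;          // sampled while latched
  std::cout << optimistic_ok(b, sampled) << '\n';  // 1: guess still valid
  b.modify(42);                                    // concurrent change
  std::cout << optimistic_ok(b, sampled) << '\n';  // 0: caller must retry
}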
*/ + fix_type= MTR_MEMO_PAGE_X_FIX; + if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line)) + { + buf_block_buf_fix_dec(block); + return nullptr; + } + } + + mtr_memo_push(mtr, block, fix_type); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(bpage->buf_fix_count()); + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE); + ut_ad(bpage->id() == page_id); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + buf_pool.stat.n_page_gets++; + return block; +} + +/** Initialize the block. +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param fix initial buf_fix_count() */ +void buf_block_t::initialise(const page_id_t page_id, ulint zip_size, + uint32_t fix) +{ + ut_ad(page.state() != BUF_BLOCK_FILE_PAGE); + buf_block_init_low(this); + page.init(page_id, fix); + page_zip_set_size(&page.zip, zip_size); +} + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). +@param[in,out] space space object +@param[in] offset offset of the tablespace +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[in,out] free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* +buf_page_create(fil_space_t *space, uint32_t offset, + ulint zip_size, mtr_t *mtr, buf_block_t *free_block) +{ + page_id_t page_id(space->id, offset); + ut_ad(mtr->is_active()); + ut_ad(page_id.space() != 0 || !zip_size); + + space->free_page(offset, false); + free_block->initialise(page_id, zip_size, 1); + + const ulint fold= page_id.fold(); + mysql_mutex_lock(&buf_pool.mutex); + +loop: + buf_block_t *block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash_get_low(page_id, fold)); + + if (block && block->page.in_file() && + !buf_pool.watch_is_sentinel(block->page)) + { +#ifdef BTR_CUR_HASH_ADAPT + const dict_index_t *drop_hash_entry= nullptr; +#endif + switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) { + default: + ut_ad(0); + break; + case BUF_BLOCK_FILE_PAGE: + if (!mtr->have_x_latch(*block)) + { + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + while (!rw_lock_x_lock_nowait(&block->lock)) + { + /* Wait for buf_page_write_complete() to release block->lock. + We must not hold buf_pool.mutex while waiting. */ + timespec abstime; + set_timespec_nsec(abstime, 1000000); + my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, + &abstime); + } + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); + } + else + { + ut_ad(!block->page.ibuf_exist); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!block->index); +#endif + } +#ifdef BTR_CUR_HASH_ADAPT + drop_hash_entry= block->index; +#endif + break; + case BUF_BLOCK_ZIP_PAGE: + page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + if (block->page.io_fix() != BUF_IO_NONE) + { + hash_lock->write_unlock(); + /* Wait for buf_page_write_complete() to release the I/O fix. 
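The waits above are bounded rather than open-ended: instead of blocking indefinitely on buf_pool.done_flush_list, the code sleeps at most about one millisecond per iteration and then re-checks the block state, so a missed wakeup costs only a short delay. A standalone sketch of that timed-wait-and-recheck loop using the standard library (illustrative only; the actual code uses my_cond_timedwait on MySQL mutex/condition types):

// Editorial sketch of a bounded condition wait with re-check, not InnoDB code.
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <iostream>

std::mutex m;
std::condition_variable done_flush;
bool io_fixed= true;            // stands in for "the page is still I/O-fixed"

void wait_until_unfixed()
{
  std::unique_lock<std::mutex> lk(m);
  while (io_fixed)
    // Wake up at least once per millisecond and re-evaluate the condition,
    // mirroring the 1000000 ns timeout used with my_cond_timedwait above.
    done_flush.wait_for(lk, std::chrono::milliseconds(1));
  std::cout << "page no longer I/O-fixed\n";
}

int main()
{
  std::thread waiter(wait_until_unfixed);
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
  {
    std::lock_guard<std::mutex> lk(m);
    io_fixed= false;            // the "write completion" clears the fix
  }
  done_flush.notify_all();
  waiter.join();
}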
*/ + timespec abstime; + set_timespec_nsec(abstime, 1000000); + my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, + &abstime); + goto loop; + } + + rw_lock_x_lock(&free_block->lock); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(&block->page, &free_block->page); + buf_flush_relocate_on_flush_list(&block->page, &free_block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + free_block->page.set_state(BUF_BLOCK_FILE_PAGE); + buf_unzip_LRU_add_block(free_block, FALSE); + hash_lock->write_unlock(); + buf_page_free_descriptor(&block->page); + block= free_block; + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); + break; + } + + mysql_mutex_unlock(&buf_pool.mutex); + +#ifdef BTR_CUR_HASH_ADAPT + if (drop_hash_entry) + btr_search_drop_page_hash_index(block); +#endif /* BTR_CUR_HASH_ADAPT */ + + if (block->page.ibuf_exist) + { + if (!recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + block->page.ibuf_exist= false; + } + + return block; + } + + /* If we get here, the page was not in buf_pool: init it there */ + + DBUG_PRINT("ib_buf", ("create page %u:%u", + page_id.space(), page_id.page_no())); + + block= free_block; + + /* Duplicate buf_block_buf_fix_inc_func() */ + ut_ad(block->page.buf_fix_count() == 1); + ut_ad(fsp_is_system_temporary(page_id.space()) || + rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__)); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(&block->page, false); + page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + block->page.set_state(BUF_BLOCK_FILE_PAGE); + ut_d(block->page.in_page_hash= true); + HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page); + + rw_lock_x_lock(&block->lock); + if (UNIV_UNLIKELY(zip_size)) + { + /* Prevent race conditions during buf_buddy_alloc(), which may + release and reacquire buf_pool.mutex, by IO-fixing and X-latching + the block. */ + block->page.set_io_fix(BUF_IO_READ); + hash_lock->write_unlock(); + + /* buf_pool.mutex may be released and reacquired by + buf_buddy_alloc(). We must defer this operation until + after the block descriptor has been added to + buf_pool.LRU and buf_pool.page_hash. */ + block->page.zip.data= buf_buddy_alloc(zip_size); + + /* To maintain the invariant block->in_unzip_LRU_list == + block->page.belongs_to_unzip_LRU() we have to add this + block to unzip_LRU after block->page.zip.data is set. 
*/ + ut_ad(block->page.belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(block, FALSE); + + block->page.set_io_fix(BUF_IO_NONE); + } + else + hash_lock->write_unlock(); + + mysql_mutex_unlock(&buf_pool.mutex); + + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + block->page.set_accessed(); + buf_pool.stat.n_pages_created++; + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} && + !recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + + static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); + memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + + /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the + following pages: + (1) The first page of the InnoDB system tablespace (page 0:0) + (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages + (3) key_version on encrypted pages (not page 0:0) */ + + memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return block; +} + +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param io_type BUF_IO_READ or BUF_IO_WRITE */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type) +{ + const byte* frame; + monitor_id_t counter; + + ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); + + frame = bpage->zip.data + ? 
bpage->zip.data + : ((buf_block_t*) bpage)->frame; + + switch (fil_page_get_type(frame)) { + ulint level; + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + level = btr_page_get_level(frame); + + /* Check if it is an index page for insert buffer */ + if (fil_page_get_type(frame) == FIL_PAGE_INDEX + && btr_page_get_index_id(frame) + == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_IBUF_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + io_type, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE); + } + } else { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + io_type, MONITOR_INDEX_NON_LEAF_PAGE); + } + } + break; + + case FIL_PAGE_UNDO_LOG: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE); + break; + + case FIL_PAGE_INODE: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE); + break; + + case FIL_PAGE_IBUF_FREE_LIST: + counter = MONITOR_RW_COUNTER(io_type, + MONITOR_IBUF_FREELIST_PAGE); + break; + + case FIL_PAGE_IBUF_BITMAP: + counter = MONITOR_RW_COUNTER(io_type, + MONITOR_IBUF_BITMAP_PAGE); + break; + + case FIL_PAGE_TYPE_SYS: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_TRX_SYS: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_FSP_HDR: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE); + break; + + case FIL_PAGE_TYPE_XDES: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE); + break; + + case FIL_PAGE_TYPE_BLOB: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB2: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE); + break; + + default: + counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE); + } + + MONITOR_INC_NOCHECK(counter); +} + +/** Mark a table corrupted. +@param[in] bpage corrupted page +@param[in] space tablespace of the corrupted page */ +ATTRIBUTE_COLD +static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space) +{ + /* If block is not encrypted find the table with specified + space id, and mark it corrupted. Encrypted tables + are marked unusable later e.g. in ::open(). */ + if (!space.crypt_data + || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) { + dict_set_corrupted_by_space(&space); + } else { + dict_set_encrypted_by_space(&space); + } +} + +/** Release and evict a corrupted page. +@param bpage page that was being read */ +ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage) +{ + const page_id_t id(bpage->id()); + page_hash_latch *hash_lock= hash_lock_get(id); + + mysql_mutex_lock(&mutex); + hash_lock->write_lock(); + + ut_ad(bpage->io_fix() == BUF_IO_READ); + ut_ad(!bpage->oldest_modification()); + bpage->set_corrupt_id(); + + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock, + BUF_IO_READ); + + bpage->io_unfix(); + + /* remove from LRU and page_hash */ + buf_LRU_free_one_page(bpage, id, hash_lock); + mysql_mutex_unlock(&mutex); + + ut_d(auto n=) n_pend_reads--; + ut_ad(n > 0); +} + +/** Mark a table corrupted. +@param[in] bpage Corrupted page +@param[in] node data file +Also remove the bpage from LRU list. 
*/ +ATTRIBUTE_COLD +static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node) +{ + ut_ad(bpage->id().space() == node.space->id); + buf_pool.corrupted_evict(bpage); + + if (!srv_force_recovery) + buf_mark_space_corrupt(bpage, *node.space); +} + +/** Check if the encrypted page is corrupted for the full crc32 format. +@param[in] space_id page belongs to space id +@param[in] d page +@param[in] is_compressed compressed page +@return true if page is corrupted or false if it isn't */ +static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d, + bool is_compressed) +{ + if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID)) + return true; + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + + return !is_compressed && + memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d, + d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4); +} + +/** Check if page is maybe compressed, encrypted or both when we encounter +corrupted page. Note that we can't be 100% sure if page is corrupted +or decrypt/decompress just failed. +@param[in,out] bpage page +@param[in] node data file +@return whether the operation succeeded +@retval DB_SUCCESS if page has been read and is not corrupted +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if accessed tablespace is not found */ +static dberr_t buf_page_check_corrupt(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + + byte* dst_frame = (bpage->zip.data) ? bpage->zip.data : + ((buf_block_t*) bpage)->frame; + dberr_t err = DB_SUCCESS; + uint key_version = buf_page_get_key_version(dst_frame, + node.space->flags); + + /* In buf_decrypt_after_read we have either decrypted the page if + page post encryption checksum matches and used key_id is found + from the encryption plugin. If checksum did not match page was + not decrypted and it could be either encrypted and corrupted + or corrupted or good page. If we decrypted, there page could + still be corrupted if used key does not match. */ + const bool seems_encrypted = !node.space->full_crc32() && key_version + && node.space->crypt_data + && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY || + node.space->full_crc32()); + + /* If traditional checksums match, we assume that page is + not anymore encrypted. */ + if (node.space->full_crc32() + && !buf_is_zeroes(span<const byte>(dst_frame, + node.space->physical_size())) + && (key_version || node.space->is_compressed() + || node.space->purpose == FIL_TYPE_TEMPORARY)) { + if (buf_page_full_crc32_is_corrupted( + bpage->id().space(), dst_frame, + node.space->is_compressed())) { + err = DB_PAGE_CORRUPTED; + } + } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) { + err = DB_PAGE_CORRUPTED; + } + + if (seems_encrypted && err == DB_PAGE_CORRUPTED + && bpage->id().page_no() != 0) { + err = DB_DECRYPTION_FAILED; + + ib::error() + << "The page " << bpage->id() + << " in file '" << node.name + << "' cannot be decrypted."; + + ib::info() + << "However key management plugin or used key_version " + << key_version + << " is not found or" + " used encryption algorithm or method does not match."; + + if (bpage->id().space() != TRX_SYS_SPACE) { + ib::info() + << "Marking tablespace as missing." 
+ " You may drop this table or" + " install correct key management plugin" + " and key file."; + } + } + + return (err); +} + +/** Complete a read request of a file page to buf_pool. +@param bpage recently read page +@param node data file +@return whether the operation succeeded +@retval DB_SUCCESS always when writing, or if a read page was OK +@retval DB_PAGE_CORRUPTED if the checksum fails on a page read +@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */ +dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node) +{ + const page_id_t id(bpage->id()); + ut_ad(bpage->in_file()); + ut_ad(!buf_dblwr.is_inside(id)); + ut_ad(id.space() == node.space->id); + ut_ad(bpage->zip_size() == node.space->zip_size()); + + /* We do not need protect io_fix here by mutex to read it because + this and buf_page_write_complete() are the only functions where we can + change the value from BUF_IO_READ or BUF_IO_WRITE to some other + value, and our code ensures that this is the only thread that handles + the i/o for this block. */ + + ut_ad(bpage->io_fix() == BUF_IO_READ); + ut_ad(!!bpage->zip.ssize == !!bpage->zip.data); + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data); + + const byte *frame= bpage->zip.data + ? bpage->zip.data + : reinterpret_cast<buf_block_t*>(bpage)->frame; + ut_ad(frame); + + dberr_t err; + if (!buf_page_decrypt_after_read(bpage, node)) + { + err= DB_DECRYPTION_FAILED; + goto database_corrupted; + } + + if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE) + { + buf_pool.n_pend_unzip++; + auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE); + buf_pool.n_pend_unzip--; + + if (!ok) + { + ib::info() << "Page " << id << " zip_decompress failure."; + err= DB_PAGE_CORRUPTED; + goto database_corrupted; + } + } + + { + const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID), + mach_read_from_4(frame + FIL_PAGE_OFFSET)); + + if (read_id == id); + else if (read_id == page_id_t(0, 0)) + /* This is likely an uninitialized page. */; + else if (!node.space->full_crc32() && + page_id_t(0, read_id.page_no()) == id) + /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace + before MySQL 4.1.1, which introduced innodb_file_per_table. */; + else if (node.space->full_crc32() && + *reinterpret_cast<const uint32_t*> + (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) && + node.space->crypt_data && + node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED) + { + ib::error() << "Cannot decrypt " << id; + err= DB_DECRYPTION_FAILED; + goto release_page; + } + else + ib::error() << "Space id and page no stored in the page, read in are " + << read_id << ", should be " << id; + } + + err= buf_page_check_corrupt(bpage, node); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +database_corrupted: + /* Not a real corruption if it was triggered by error injection */ + DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", + if (!is_predefined_tablespace(id.space())) + { + buf_corrupt_page_release(bpage, node); + ib::info() << "Simulated IMPORT corruption"; + return err; + } + err= DB_SUCCESS; + goto page_not_corrupt;); + + if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE) + memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size); + + if (err == DB_PAGE_CORRUPTED) + { + ib::error() << "Database page corruption on disk" + " or a failed read of file '" + << node.name << "' page " << id + << ". 
You may have to recover from a backup."; + + buf_page_print(frame, bpage->zip_size()); + + ib::info() << " You can use CHECK TABLE to scan" + " your table for corruption. " + << FORCE_RECOVERY_MSG; + } + + if (!srv_force_recovery) + { + /* If the corruption is in the system tablespace, we will + intentionally crash the server. */ + if (id.space() == TRX_SYS_SPACE) + ib::fatal() << "Aborting because of a corrupt database page."; + buf_corrupt_page_release(bpage, node); + return err; + } + } + + DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", + page_not_corrupt: bpage= bpage; ); + + if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) + { +release_page: + buf_corrupt_page_release(bpage, node); + if (recv_recovery_is_on()) + recv_sys.free_corrupted_page(id); + return err; + } + + if (recv_recovery_is_on()) + recv_recover_page(node.space, bpage); + + if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations && + (!id.space() || !is_predefined_tablespace(id.space())) && + fil_page_get_type(frame) == FIL_PAGE_INDEX && + page_is_leaf(frame)) + bpage->ibuf_exist= true; + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(bpage, BUF_IO_READ); + DBUG_PRINT("ib_buf", ("read page %u:%u", + id.space(), id.page_no())); + + /* Because this thread which does the unlocking might not be the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread id. */ + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ); + bpage->io_unfix(); + + ut_d(auto n=) buf_pool.n_pend_reads--; + ut_ad(n > 0); + buf_pool.stat.n_pages_read++; + + return DB_SUCCESS; +} + +#ifdef UNIV_DEBUG +/** Check that all blocks are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +void buf_pool_t::assert_all_freed() +{ + mysql_mutex_lock(&mutex); + const chunk_t *chunk= chunks; + for (auto i= n_chunks; i--; chunk++) + if (const buf_block_t* block= chunk->not_freed()) + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + mysql_mutex_unlock(&mutex); +} +#endif /* UNIV_DEBUG */ + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats() +{ + buf_pool.last_printout_time = time(NULL); + buf_pool.old_stat = buf_pool.stat; +} + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate() +{ + mysql_mutex_lock(&buf_pool.mutex); + + buf_flush_wait_batch_end(true); + buf_flush_wait_batch_end(false); + + /* It is possible that a write batch that has been posted + earlier is still not complete. For buffer pool invalidation to + proceed we must ensure there is NO write activity happening. */ + + ut_d(mysql_mutex_unlock(&buf_pool.mutex)); + ut_d(buf_pool.assert_all_freed()); + ut_d(mysql_mutex_lock(&buf_pool.mutex)); + + while (buf_LRU_scan_and_free_block()); + + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + + buf_pool.freed_page_clock = 0; + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + + memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat)); + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + +#ifdef UNIV_DEBUG +/** Validate the buffer pool. 
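Earlier in buf_page_read_complete(), the FIL_PAGE_SPACE_ID and FIL_PAGE_OFFSET fields stored inside the page are cross-checked against the page id that was requested; on-page integers are stored big-endian and read with the mach_read_from_4() helpers. The self-contained sketch below illustrates that consistency check; read_be32(), page_id_matches() and the two header offsets are illustrative assumptions, not InnoDB definitions.

// Editorial sketch of the big-endian page-id cross-check, not InnoDB code.
#include <cstdint>
#include <cstddef>
#include <iostream>

constexpr size_t PAGE_OFFSET_FIELD= 4;    // page number field (assumed offset)
constexpr size_t SPACE_ID_FIELD= 34;      // tablespace id field (assumed offset)

// Equivalent of mach_read_from_4(): 4 bytes, most significant byte first.
uint32_t read_be32(const unsigned char *b)
{
  return uint32_t{b[0]} << 24 | uint32_t{b[1]} << 16 |
         uint32_t{b[2]} << 8  | uint32_t{b[3]};
}

bool page_id_matches(const unsigned char *frame,
                     uint32_t space_id, uint32_t page_no)
{
  return read_be32(frame + SPACE_ID_FIELD) == space_id &&
         read_be32(frame + PAGE_OFFSET_FIELD) == page_no;
}

int main()
{
  unsigned char frame[64]= {};
  frame[6]= 0x12; frame[7]= 0x34;                            // page 0x1234
  frame[37]= 5;                                              // space 5
  std::cout << page_id_matches(frame, 5, 0x1234) << '\n';    // 1
  std::cout << page_id_matches(frame, 6, 0x1234) << '\n';    // 0: mismatch
}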
*/ +void buf_pool_t::validate() +{ + ulint n_lru = 0; + ulint n_flushing = 0; + ulint n_free = 0; + ulint n_zip = 0; + + mysql_mutex_lock(&mutex); + + chunk_t* chunk = chunks; + + /* Check the uncompressed blocks. */ + + for (auto i = n_chunks; i--; chunk++) { + + ulint j; + buf_block_t* block = chunk->blocks; + + for (j = chunk->size; j--; block++) { + switch (block->page.state()) { + case BUF_BLOCK_ZIP_PAGE: + /* This kind of block descriptors should + be allocated by malloc() only. */ + ut_error; + break; + + case BUF_BLOCK_NOT_USED: + n_free++; + break; + + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + /* do nothing */ + break; + + case BUF_BLOCK_FILE_PAGE: + const page_id_t id = block->page.id(); + ut_ad(page_hash_get_low(id, id.fold()) + == &block->page); + n_lru++; + break; + + } + } + } + + /* Check dirty blocks. */ + + mysql_mutex_lock(&flush_list_mutex); + for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->oldest_modification()); + ut_ad(!fsp_is_system_temporary(b->id().space())); + n_flushing++; + + switch (b->state()) { + case BUF_BLOCK_ZIP_PAGE: + n_lru++; + n_zip++; + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + const page_id_t id = b->id(); + ut_ad(page_hash_get_low(id, id.fold()) == b); + } + + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); + + mysql_mutex_unlock(&flush_list_mutex); + + if (curr_size == old_size + && n_lru + n_free > curr_size + n_zip) { + + ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free + << ", pool " << curr_size + << " zip " << n_zip << ". Aborting..."; + } + + ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); + + if (curr_size == old_size + && UT_LIST_GET_LEN(free) != n_free) { + + ib::fatal() << "Free list len " + << UT_LIST_GET_LEN(free) + << ", free blocks " << n_free << ". Aborting..."; + } + + mysql_mutex_unlock(&mutex); + + ut_d(buf_LRU_validate()); + ut_d(buf_flush_validate()); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Write information of the buf_pool to the error log. 
*/ +void buf_pool_t::print() +{ + index_id_t* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + index_id_t id; + ulint n_found; + chunk_t* chunk; + dict_index_t* index; + + size = curr_size; + + index_ids = static_cast<index_id_t*>( + ut_malloc_nokey(size * sizeof *index_ids)); + + counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size)); + + mysql_mutex_lock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + + ib::info() + << "[buffer pool: size=" << curr_size + << ", database pages=" << UT_LIST_GET_LEN(LRU) + << ", free pages=" << UT_LIST_GET_LEN(free) + << ", modified database pages=" + << UT_LIST_GET_LEN(flush_list) + << ", n pending decompressions=" << n_pend_unzip + << ", n pending reads=" << n_pend_reads + << ", n pending flush LRU=" << n_flush_LRU_ + << " list=" << n_flush_list_ + << ", pages made young=" << stat.n_pages_made_young + << ", not young=" << stat.n_pages_not_made_young + << ", pages read=" << stat.n_pages_read + << ", created=" << stat.n_pages_created + << ", written=" << stat.n_pages_written << "]"; + + mysql_mutex_unlock(&flush_list_mutex); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + chunk = chunks; + + for (i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + ulint n_blocks = chunk->size; + + for (; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + if (fil_page_index_page_check(frame)) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (index_ids[j] == id) { + counts[j]++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + } + + mysql_mutex_unlock(&mutex); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + if (!index) { + ib::info() << "Block count for index " + << index_ids[i] << " in buffer is about " + << counts[i]; + } else { + ib::info() << "Block count for index " << index_ids[i] + << " in buffer is about " << counts[i] + << ", index " << index->name + << " of table " << index->table->name; + } + } + + ut_free(index_ids); + ut_free(counts); + + validate(); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number() +{ + ulint fixed_pages_number= 0; + + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b; + b= UT_LIST_GET_NEXT(LRU, b)) + if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE)) + fixed_pages_number++; + + mysql_mutex_unlock(&buf_pool.mutex); + + return fixed_pages_number; +} +#endif /* UNIV_DEBUG */ + +/** Collect buffer pool metadata. 
+@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info) +{ + time_t current_time; + double time_elapsed; + + mysql_mutex_lock(&buf_pool.mutex); + + pool_info->pool_size = buf_pool.curr_size; + + pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + pool_info->old_lru_len = buf_pool.LRU_old_len; + + pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); + + pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + pool_info->n_pend_reads = buf_pool.n_pend_reads; + + pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_; + + pool_info->n_pending_flush_list = buf_pool.n_flush_list_; + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool.last_printout_time); + + pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young; + + pool_info->n_pages_not_made_young = + buf_pool.stat.n_pages_not_made_young; + + pool_info->n_pages_read = buf_pool.stat.n_pages_read; + + pool_info->n_pages_created = buf_pool.stat.n_pages_created; + + pool_info->n_pages_written = buf_pool.stat.n_pages_written; + + pool_info->n_page_gets = buf_pool.stat.n_page_gets; + + pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd; + pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read; + + pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted; + + pool_info->page_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young) + / time_elapsed; + + pool_info->page_not_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young) + / time_elapsed; + + pool_info->pages_read_rate = + static_cast<double>(buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read) + / time_elapsed; + + pool_info->pages_created_rate = + static_cast<double>(buf_pool.stat.n_pages_created + - buf_pool.old_stat.n_pages_created) + / time_elapsed; + + pool_info->pages_written_rate = + static_cast<double>(buf_pool.stat.n_pages_written + - buf_pool.old_stat.n_pages_written) + / time_elapsed; + + pool_info->n_page_get_delta = buf_pool.stat.n_page_gets + - buf_pool.old_stat.n_page_gets; + + if (pool_info->n_page_get_delta) { + pool_info->page_read_delta = buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read; + + pool_info->young_making_delta = + buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young; + + pool_info->not_young_making_delta = + buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young; + } + pool_info->pages_readahead_rnd_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd + - buf_pool.old_stat.n_ra_pages_read_rnd) + / time_elapsed; + + + pool_info->pages_readahead_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read + - buf_pool.old_stat.n_ra_pages_read) + / time_elapsed; + + pool_info->pages_evicted_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_evicted + - buf_pool.old_stat.n_ra_pages_evicted) + / time_elapsed; + + pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + + pool_info->io_sum = buf_LRU_stat_sum.io; + + pool_info->io_cur = buf_LRU_stat_cur.io; + + pool_info->unzip_sum = buf_LRU_stat_sum.unzip; + + pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + 
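buf_stats_get_pool_info() above derives every *_rate field the same way: subtract the counter snapshot saved at the last printout from the current counter and divide by the elapsed wall-clock time, padded with 0.001 s so the divisor can never be zero. A minimal sketch of that computation with illustrative names (counter_snapshot, pages_read_rate):

// Editorial sketch of the per-second rate computation above, not InnoDB code.
#include <cstdint>
#include <ctime>
#include <iostream>

struct counter_snapshot
{
  uint64_t n_pages_read;
  time_t   taken_at;
};

double pages_read_rate(uint64_t n_pages_read_now, time_t now,
                       const counter_snapshot &old_stat)
{
  // 0.001 keeps the divisor positive even within the same second.
  const double time_elapsed= 0.001 + difftime(now, old_stat.taken_at);
  return static_cast<double>(n_pages_read_now - old_stat.n_pages_read)
         / time_elapsed;
}

int main()
{
  counter_snapshot old_stat{1000, time(nullptr) - 10};   // snapshot 10 s ago
  std::cout << pages_read_rate(6000, time(nullptr), old_stat)
            << " pages/s\n";                             // roughly 500
}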
+/*********************************************************************//** +Prints info of the buffer i/o. */ +static +void +buf_print_io_instance( +/*==================*/ + buf_pool_info_t*pool_info, /*!< in: buffer pool info */ + FILE* file) /*!< in/out: buffer where to print */ +{ + ut_ad(pool_info); + + fprintf(file, + "Buffer pool size " ULINTPF "\n" + "Free buffers " ULINTPF "\n" + "Database pages " ULINTPF "\n" + "Old database pages " ULINTPF "\n" + "Modified db pages " ULINTPF "\n" + "Percent of dirty pages(LRU & free pages): %.3f\n" + "Max dirty pages percent: %.3f\n" + "Pending reads " ULINTPF "\n" + "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n", + pool_info->pool_size, + pool_info->free_list_len, + pool_info->lru_len, + pool_info->old_lru_len, + pool_info->flush_list_len, + static_cast<double>(pool_info->flush_list_len) + / (static_cast<double>(pool_info->lru_len + + pool_info->free_list_len) + 1.0) + * 100.0, + srv_max_buf_pool_modified_pct, + pool_info->n_pend_reads, + pool_info->n_pending_flush_lru, + pool_info->n_pending_flush_list); + + fprintf(file, + "Pages made young " ULINTPF ", not young " ULINTPF "\n" + "%.2f youngs/s, %.2f non-youngs/s\n" + "Pages read " ULINTPF ", created " ULINTPF + ", written " ULINTPF "\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + pool_info->n_pages_made_young, + pool_info->n_pages_not_made_young, + pool_info->page_made_young_rate, + pool_info->page_not_made_young_rate, + pool_info->n_pages_read, + pool_info->n_pages_created, + pool_info->n_pages_written, + pool_info->pages_read_rate, + pool_info->pages_created_rate, + pool_info->pages_written_rate); + + if (pool_info->n_page_get_delta) { + double hit_rate = static_cast<double>( + pool_info->page_read_delta) + / static_cast<double>(pool_info->n_page_get_delta); + + if (hit_rate > 1) { + hit_rate = 1; + } + + fprintf(file, + "Buffer pool hit rate " ULINTPF " / 1000," + " young-making rate " ULINTPF " / 1000 not " + ULINTPF " / 1000\n", + ulint(1000 * (1 - hit_rate)), + ulint(1000 + * double(pool_info->young_making_delta) + / double(pool_info->n_page_get_delta)), + ulint(1000 * double(pool_info->not_young_making_delta) + / double(pool_info->n_page_get_delta))); + } else { + fputs("No buffer pool page gets since the last printout\n", + file); + } + + /* Statistics about read ahead algorithm */ + fprintf(file, "Pages read ahead %.2f/s," + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", + + pool_info->pages_readahead_rate, + pool_info->pages_evicted_rate, + pool_info->pages_readahead_rnd_rate); + + /* Print some values to help us with visualizing what is + happening with LRU eviction. */ + fprintf(file, + "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n" + "I/O sum[" ULINTPF "]:cur[" ULINTPF "], " + "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n", + pool_info->lru_len, pool_info->unzip_lru_len, + pool_info->io_sum, pool_info->io_cur, + pool_info->unzip_sum, pool_info->unzip_cur); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file) /*!< in/out: buffer where to print */ +{ + buf_pool_info_t pool_info; + + buf_stats_get_pool_info(&pool_info); + buf_print_io_instance(&pool_info, file); +} + +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. 
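The report produced by buf_print_io_instance() above expresses the hit rate per 1000 page gets: hit_rate = page_read_delta / n_page_get_delta, clamped to 1 because read-ahead can make physical reads exceed logical gets, and printed as 1000 * (1 - hit_rate). A small sketch of that arithmetic (hit_rate_per_1000 is an illustrative name):

// Editorial sketch of the per-1000 hit-rate arithmetic above, not InnoDB code.
#include <cstdint>
#include <iostream>

uint64_t hit_rate_per_1000(uint64_t page_read_delta, uint64_t n_page_get_delta)
{
  if (n_page_get_delta == 0)
    return 0;                   // "No buffer pool page gets since the last printout"
  double miss_ratio= static_cast<double>(page_read_delta)
                     / static_cast<double>(n_page_get_delta);
  if (miss_ratio > 1)           // reads can exceed gets (read-ahead): clamp
    miss_ratio= 1;
  return static_cast<uint64_t>(1000 * (1 - miss_ratio));
}

int main()
{
  // 40 physical reads out of 10000 logical page gets => 996 / 1000 hit rate.
  std::cout << hit_rate_per_1000(40, 10000) << '\n';
}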
+@param[in] page page frame +@param[in] fsp_flags tablespace flags +@return true if true if page is encrypted and OK, false otherwise */ +bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags) +{ + if (!fil_space_t::full_crc32(fsp_flags)) { + return fil_space_verify_crypt_checksum( + page, fil_space_t::zip_size(fsp_flags)); + } + + return !buf_page_is_corrupted(true, page, fsp_flags); +} + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& operator<<(std::ostream &out, const page_id_t page_id) +{ + out << "[page id: space=" << page_id.space() + << ", page number=" << page_id.page_no() << "]"; + return out; +} +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc new file mode 100644 index 00000000..e98dc184 --- /dev/null +++ b/storage/innobase/buf/buf0checksum.cc @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.cc +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#include "buf0checksum.h" +#include "fil0fil.h" +#include "ut0crc32.h" +#include "ut0rnd.h" + +#ifndef UNIV_INNOCHECKSUM +#include "srv0srv.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** the value of innodb_checksum_algorithm */ +ulong srv_checksum_algorithm; + +/** Calculate the CRC32 checksum of a page. The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page) +{ + /* Note: innodb_checksum_algorithm=crc32 could and should have + included the entire page in the checksum, and CRC-32 values + should be combined with the CRC-32 function, not with + exclusive OR. We stick to the current algorithm in order to + remain compatible with old data files. */ + return ut_crc32(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + ^ ut_crc32(page + FIL_PAGE_DATA, + srv_page_size + - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); +} + +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. 
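buf_calc_page_crc32() above computes CRC-32C over two byte ranges, skipping the checksum fields and the FLUSH_LSN area, and combines the two values with XOR rather than a proper CRC combine, purely to stay compatible with existing data files. The sketch below reproduces that shape with a plain bitwise CRC-32C in place of ut_crc32(); the numeric header offsets are the conventional values and are shown only for illustration.

// Editorial sketch of the crc32 page checksum shape above, not InnoDB code.
// A slow bitwise CRC-32C stands in for ut_crc32(); offsets are illustrative.
#include <cstdint>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr size_t PAGE_SIZE= 16384;
constexpr size_t FIL_PAGE_OFFSET_= 4;        // page number field
constexpr size_t FIL_PAGE_FLUSH_LSN_= 26;    // skipped by the checksum
constexpr size_t FIL_PAGE_DATA_= 38;         // start of the page payload
constexpr size_t END_LSN_OLD_CHKSUM_= 8;     // trailer skipped as well

uint32_t crc32c(const unsigned char *data, size_t len)
{
  uint32_t crc= 0xFFFFFFFFu;
  for (size_t i= 0; i < len; i++)
  {
    crc^= data[i];
    for (int k= 0; k < 8; k++)
      crc= (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0u);
  }
  return ~crc;
}

uint32_t page_crc32(const unsigned char *page)
{
  // Two ranges, combined with XOR exactly like buf_calc_page_crc32().
  return crc32c(page + FIL_PAGE_OFFSET_,
                FIL_PAGE_FLUSH_LSN_ - FIL_PAGE_OFFSET_)
       ^ crc32c(page + FIL_PAGE_DATA_,
                PAGE_SIZE - (FIL_PAGE_DATA_ + END_LSN_OLD_CHKSUM_));
}

int main()
{
  std::vector<unsigned char> page(PAGE_SIZE, 0);
  std::cout << std::hex << page_crc32(page.data()) << '\n';
}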
+@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_new_checksum(const byte* page) +{ + ulint checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool + to the first pages of data files, we have to skip them in the page + checksum calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of page because + there we store the old formula checksum. */ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + srv_page_size - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + return(static_cast<uint32_t>(checksum)); +} + +/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that +the checksum only looked at the first few bytes of the page. +This calculates that old checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_old_checksum(const byte* page) +{ + return(static_cast<uint32_t> + (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION))); +} + +/** Return a printable string describing the checksum algorithm. +@param[in] algo algorithm +@return algorithm name */ +const char* +buf_checksum_algorithm_name(srv_checksum_algorithm_t algo) +{ + switch (algo) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + return("crc32"); + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + return("strict_crc32"); + case SRV_CHECKSUM_ALGORITHM_INNODB: + return("innodb"); + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + return("strict_innodb"); + case SRV_CHECKSUM_ALGORITHM_NONE: + return("none"); + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return("strict_none"); + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + return("full_crc32"); + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + return("strict_full_crc32"); + } + + ut_error; + return(NULL); +} diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc new file mode 100644 index 00000000..52e947b7 --- /dev/null +++ b/storage/innobase/buf/buf0dblwr.cc @@ -0,0 +1,764 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dblwr.cc +Doublwrite buffer module + +Created 2011/12/19 +*******************************************************/ + +#include "buf0dblwr.h" +#include "buf0buf.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "sync0sync.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "fil0crypt.h" +#include "fil0pagecompress.h" + +using st_::span; + +/** The doublewrite buffer */ +buf_dblwr_t buf_dblwr; + +/** @return the TRX_SYS page */ +inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) +{ + buf_block_t *block= buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO), + 0, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + return block; +} + +/** Initialize the doublewrite buffer data structure. +@param header doublewrite page header in the TRX_SYS page */ +inline void buf_dblwr_t::init(const byte *header) +{ + ut_ad(!active_slot->first_free); + ut_ad(!active_slot->reserved); + ut_ad(!batch_running); + + mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); + pthread_cond_init(&cond, nullptr); + block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1)); + block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2)); + + const uint32_t buf_size= 2 * block_size(); + for (int i= 0; i < 2; i++) + { + slots[i].write_buf= static_cast<byte*> + (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size)); + slots[i].buf_block_arr= static_cast<element*> + (ut_zalloc_nokey(buf_size * sizeof(element))); + } + active_slot= &slots[0]; +} + +/** Create or restore the doublewrite buffer in the TRX_SYS page. +@return whether the operation succeeded */ +bool buf_dblwr_t::create() +{ + if (is_initialised()) + return true; + + mtr_t mtr; + const ulint size= block_size(); + +start_again: + mtr.start(); + + buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* The doublewrite buffer has already been created: just read in + some numbers */ + init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame); + mtr.commit(); + return true; + } + + if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size) + { +too_small: + ib::error() << "Cannot create doublewrite buffer: " + "the first file in innodb_data_file_path must be at least " + << (3 * (size >> (20U - srv_page_size_shift))) << "M."; + mtr.commit(); + return false; + } + else + { + buf_block_t *b= fseg_create(fil_system.sys_space, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG, + &mtr, false, trx_sys_block); + if (!b) + goto too_small; + ib::info() << "Doublewrite buffer not found: creating new"; + + /* FIXME: After this point, the doublewrite buffer creation + is not atomic. The doublewrite buffer should not exist in + the InnoDB system tablespace file in the first place. + It could be located in separate optional file(s) in a + user-specified location. 
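The too_small check above converts the doublewrite block size from pages to megabytes with size >> (20 - srv_page_size_shift) and requires the first system-tablespace file to hold at least three such blocks. A worked sketch of that arithmetic, assuming a 64-page block and 16 KiB pages (both values are illustrative assumptions):

// Editorial sketch of the doublewrite sizing arithmetic above, not InnoDB code.
// The 64-page block size and the 16 KiB page size are assumptions here.
#include <cstdint>
#include <iostream>

int main()
{
  const uint32_t srv_page_size_shift= 14;   // 16 KiB pages (assumed)
  const uint32_t size= 64;                  // doublewrite block size in pages (assumed)

  // Bytes covered by one block: size << srv_page_size_shift.
  const uint64_t bytes_per_block= uint64_t{size} << srv_page_size_shift;

  // The error message above reports the minimum in MiB as
  //   3 * (size >> (20 - srv_page_size_shift))
  const uint32_t min_first_file_mib= 3 * (size >> (20 - srv_page_size_shift));

  std::cout << "one doublewrite block = " << bytes_per_block << " bytes\n"  // 1 MiB
            << "first data file must be at least " << min_first_file_mib
            << "M\n";                                                       // 3M
}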
*/ + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK); + } + + byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->frame; + for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE; + i < 2 * size + extent_size / 2; i++) + { + buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1, + FSP_UP, &mtr); + if (!new_block) + { + ib::error() << "Cannot create doublewrite buffer: " + " you must increase your tablespace size." + " Cannot continue operation."; + /* This may essentially corrupt the doublewrite + buffer. However, usually the doublewrite buffer + is created at database initialization, and it + should not matter (just remove all newly created + InnoDB files and restart). */ + mtr.commit(); + return false; + } + + /* We read the allocated pages to the buffer pool; when they are + written to disk in a flush, the space id and page number fields + are also written to the pages. When we at database startup read + pages from the doublewrite buffer, we know that if the space id + and page number in them are the same as the page position in the + tablespace, then the page has not been written to in + doublewrite. */ + + ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); + const page_id_t id= new_block->page.id(); + /* We only do this in the debug build, to ensure that the check in + buf_flush_init_for_writing() will see a valid page type. The + flushes of new_block are actually unnecessary here. */ + ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame, + FIL_PAGE_TYPE_SYS)); + + if (i == size / 2) + { + ut_a(id.page_no() == size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + + trx_sys_block->frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame, + id.page_no()); + } + else if (i == size / 2 + size) + { + ut_a(id.page_no() == 2 * size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + + trx_sys_block->frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame, + id.page_no()); + } + else if (i > size / 2) + ut_a(id.page_no() == prev_page_no + 1); + + if (((i + 1) & 15) == 0) { + /* rw_locks can only be recursively x-locked 2048 times. (on 32 + bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a + negative number, and thus lock_word becomes like a shared lock). + For 4k page size this loop will lock the fseg header too many + times. Since this code is not done while any other threads are + active, restart the MTR occasionally. 
*/ + mtr.commit(); + mtr.start(); + trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->frame; + } + + prev_page_no= id.page_no(); + } + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame, + TRX_SYS_DOUBLEWRITE_MAGIC_N); + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); + mtr.commit(); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint(); + + /* Remove doublewrite pages from LRU */ + buf_pool_invalidate(); + + ib::info() << "Doublewrite buffer created"; + goto start_again; +} + +/** Initialize the doublewrite buffer memory structure on recovery. +If we are upgrading from a version before MySQL 4.1, then this +function performs the necessary update operations to support +innodb_file_per_table. If we are in a crash recovery, this function +loads the pages from double write buffer into memory. +@param file File handle +@param path Path name of file +@return DB_SUCCESS or error code */ +dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path) +{ + ut_ad(this == &buf_dblwr); + const uint32_t size= block_size(); + + /* We do the file i/o past the buffer pool */ + byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + /* Read the TRX_SYS header to check if we are using the doublewrite buffer */ + dberr_t err= os_file_read(IORequestRead, file, read_buf, + TRX_SYS_PAGE_NO << srv_page_size_shift, + srv_page_size); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the system tablespace header page"; +func_exit: + aligned_free(read_buf); + return err; + } + + /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */ + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE + + read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* There is no doublewrite buffer initialized in the TRX_SYS page. + This should normally not be possible; the doublewrite buffer should + be initialized when creating the database. 
*/ + err= DB_SUCCESS; + goto func_exit; + } + + init(TRX_SYS_DOUBLEWRITE + read_buf); + + const bool upgrade_to_innodb_file_per_table= + mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + TRX_SYS_DOUBLEWRITE + read_buf) != + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N; + + auto write_buf= active_slot->write_buf; + /* Read the pages from the doublewrite buffer to memory */ + err= os_file_read(IORequestRead, file, write_buf, + block1.page_no() << srv_page_size_shift, + size << srv_page_size_shift); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the first double write buffer extent"; + goto func_exit; + } + + err= os_file_read(IORequestRead, file, + write_buf + (size << srv_page_size_shift), + block2.page_no() << srv_page_size_shift, + size << srv_page_size_shift); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the second double write buffer extent"; + goto func_exit; + } + + byte *page= write_buf; + + if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table)) + { + ib::info() << "Resetting space id's in the doublewrite buffer"; + + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + { + memset(page + FIL_PAGE_SPACE_ID, 0, 4); + /* For innodb_checksum_algorithm=innodb, we do not need to + calculate new checksums for the pages because the field + .._SPACE_ID does not affect them. Write the page back to where + we read it from. */ + const ulint source_page_no= i < size + ? block1.page_no() + i + : block2.page_no() + i - size; + err= os_file_write(IORequestWrite, path, file, page, + source_page_no << srv_page_size_shift, srv_page_size); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to upgrade the double write buffer"; + goto func_exit; + } + } + os_file_flush(file); + } + else + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN))) + /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */ + recv_sys.dblwr.add(page); + + err= DB_SUCCESS; + goto func_exit; +} + +/** Process and remove the double write buffer pages for all tablespaces. */ +void buf_dblwr_t::recover() +{ + ut_ad(recv_sys.parse_start_lsn); + if (!is_initialised()) + return; + + uint32_t page_no_dblwr= 0; + byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size, + srv_page_size)); + byte *const buf= read_buf + srv_page_size; + + for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin(); + i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr) + { + byte *page= *i; + const uint32_t page_no= page_get_page_no(page); + if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */ + continue; + + const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); + if (recv_sys.parse_start_lsn > lsn) + /* Pages written before the checkpoint are not useful for recovery. */ + continue; + const ulint space_id= page_get_space_id(page); + const page_id_t page_id(space_id, page_no); + + if (recv_sys.scanned_lsn < lsn) + { + ib::info() << "Ignoring a doublewrite copy of page " << page_id + << " with future log sequence number " << lsn; + continue; + } + + fil_space_t *space= fil_space_t::get(space_id); + + if (!space) + /* The tablespace that this page once belonged to does not exist */ + continue; + + if (UNIV_UNLIKELY(page_no >= space->get_size())) + { + /* Do not report the warning for undo tablespaces, because they + can be truncated in place. 
*/ + if (!srv_is_undo_tablespace(space_id)) + ib::warn() << "A copy of page " << page_no + << " in the doublewrite buffer slot " << page_no_dblwr + << " is beyond the end of tablespace " << space->name + << " (" << space->size << " pages)"; +next_page: + space->release(); + continue; + } + + const ulint physical_size= space->physical_size(); + ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size))); + + /* We want to ensure that for partial reads the unread portion of + the page is NUL. */ + memset(read_buf, 0x0, physical_size); + + /* Read in the actual page from the file */ + fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER), + os_offset_t{page_no} * physical_size, + physical_size, read_buf); + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) + ib::warn() << "Double write buffer recovery: " << page_id + << " (tablespace '" << space->name + << "') read failed with error: " << fio.err; + + if (buf_is_zeroes(span<const byte>(read_buf, physical_size))) + { + /* We will check if the copy in the doublewrite buffer is + valid. If not, we will ignore this page (there should be redo + log records to initialize it). */ + } + else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf)) + goto next_page; + else + /* We intentionally skip this message for all-zero pages. */ + ib::info() << "Trying to recover page " << page_id + << " from the doublewrite buffer."; + + page= recv_sys.dblwr.find_page(page_id, space, buf); + + if (!page) + goto next_page; + + /* Write the good page from the doublewrite buffer to the intended + position. */ + space->reacquire(); + fio= space->io(IORequestWrite, + os_offset_t{page_id.page_no()} * physical_size, + physical_size, page); + + if (fio.err == DB_SUCCESS) + ib::info() << "Recovered page " << page_id << " to '" << fio.node->name + << "' from the doublewrite buffer."; + goto next_page; + } + + recv_sys.dblwr.pages.clear(); + fil_flush_file_spaces(); + aligned_free(read_buf); +} + +/** Free the doublewrite buffer. */ +void buf_dblwr_t::close() +{ + if (!is_initialised()) + return; + + /* Free the double write data structures. */ + ut_ad(!active_slot->reserved); + ut_ad(!active_slot->first_free); + ut_ad(!batch_running); + + pthread_cond_destroy(&cond); + for (int i= 0; i < 2; i++) + { + aligned_free(slots[i].write_buf); + ut_free(slots[i].buf_block_arr); + } + mysql_mutex_destroy(&mutex); + + memset((void*) this, 0, sizeof *this); + active_slot= &slots[0]; +} + +/** Update the doublewrite buffer on write completion. */ +void buf_dblwr_t::write_completed() +{ + ut_ad(this == &buf_dblwr); + ut_ad(srv_use_doublewrite_buf); + ut_ad(is_initialised()); + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&mutex); + + ut_ad(batch_running); + slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved); + ut_ad(flush_slot->reserved <= flush_slot->first_free); + + if (!--flush_slot->reserved) + { + mysql_mutex_unlock(&mutex); + /* This will finish the batch. Sync data files to the disk. */ + fil_flush_file_spaces(); + mysql_mutex_lock(&mutex); + + /* We can now reuse the doublewrite memory buffer: */ + flush_slot->first_free= 0; + batch_running= false; + pthread_cond_broadcast(&cond); + } + + mysql_mutex_unlock(&mutex); +} + +#ifdef UNIV_DEBUG +/** Check the LSN values on the page. 
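+The low 4 bytes of the 8-byte FIL_PAGE_LSN in the page header are
+duplicated near the end of the page; this debug check asserts that the
+two copies agree. Pages that are page_compressed or encrypted are
+skipped.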
+@param[in] page page to check +@param[in] s tablespace */ +static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) +{ + /* Ignore page_compressed or encrypted pages */ + if (s.is_compressed() || buf_page_get_key_version(page, s.flags)) + return; + const byte* lsn_start= FIL_PAGE_LSN + 4 + page; + const byte* lsn_end= page + srv_page_size - + (s.full_crc32() + ? FIL_PAGE_FCRC32_END_LSN + : FIL_PAGE_END_LSN_OLD_CHKSUM - 4); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4)); +} + +static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page) +{ + if (fil_space_t *space= fil_space_t::get(b.id().space())) + { + buf_dblwr_check_page_lsn(page, *space); + space->release(); + } +} + +/** Check the LSN values on the page with which this block is associated. */ +static void buf_dblwr_check_block(const buf_page_t *bpage) +{ + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE); + const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame; + + switch (fil_page_get_type(page)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + if (page_is_comp(page)) + { + if (page_simple_validate_new(page)) + return; + } + else if (page_simple_validate_old(page)) + return; + /* While it is possible that this is not an index page but just + happens to have wrongly set FIL_PAGE_TYPE, such pages should never + be modified to without also adjusting the page type during page + allocation or buf_flush_init_for_writing() or + fil_block_reset_type(). */ + buf_page_print(page); + + ib::fatal() << "Apparent corruption of an index page " << bpage->id() + << " to be written to data file. We intentionally crash" + " the server to prevent corrupt data from ending up in" + " data files."; + } +} +#endif /* UNIV_DEBUG */ + +bool buf_dblwr_t::flush_buffered_writes(const ulint size) +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(size == block_size()); + + for (;;) + { + if (!active_slot->first_free) + return false; + if (!batch_running) + break; + my_cond_wait(&cond, &mutex.m_mutex); + } + + ut_ad(active_slot->reserved == active_slot->first_free); + ut_ad(!flushing_buffered_writes); + + /* Disallow anyone else to start another batch of flushing. */ + slot *flush_slot= active_slot; + /* Switch the active slot */ + active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_a(active_slot->first_free == 0); + batch_running= true; + const ulint old_first_free= flush_slot->first_free; + auto write_buf= flush_slot->write_buf; + const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 && + old_first_free > size; + flushing_buffered_writes= 1 + multi_batch; + pages_submitted+= old_first_free; + /* Now safe to release the mutex. */ + mysql_mutex_unlock(&mutex); +#ifdef UNIV_DEBUG + for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) + { + buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage; + + if (bpage->zip.data) + /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */ + continue; + + /* Check that the actual page in the buffer pool is not corrupt + and the LSN values are sane. 
*/ + buf_dblwr_check_block(bpage); + ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2)); + } +#endif /* UNIV_DEBUG */ + const IORequest request(nullptr, fil_system.sys_space->chain.start, + IORequest::DBLWR_BATCH); + ut_a(fil_system.sys_space->acquire()); + if (multi_batch) + { + fil_system.sys_space->reacquire(); + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + size << srv_page_size_shift); + os_aio(request, write_buf + (size << srv_page_size_shift), + os_offset_t{block2.page_no()} << srv_page_size_shift, + (old_first_free - size) << srv_page_size_shift); + } + else + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + old_first_free << srv_page_size_shift); + return true; +} + +void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) +{ + ut_ad(this == &buf_dblwr); + ut_ad(srv_use_doublewrite_buf); + ut_ad(is_initialised()); + ut_ad(!srv_read_only_mode); + ut_ad(!request.bpage); + ut_ad(request.node == fil_system.sys_space->chain.start); + ut_ad(request.type == IORequest::DBLWR_BATCH); + mysql_mutex_lock(&mutex); + ut_ad(batch_running); + ut_ad(flushing_buffered_writes); + ut_ad(flushing_buffered_writes <= 2); + writes_completed++; + if (UNIV_UNLIKELY(--flushing_buffered_writes)) + { + mysql_mutex_unlock(&mutex); + return; + } + + slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved == flush_slot->first_free); + /* increment the doublewrite flushed pages counter */ + pages_written+= flush_slot->first_free; + mysql_mutex_unlock(&mutex); + + /* Now flush the doublewrite buffer data to disk */ + fil_system.sys_space->flush<false>(); + + /* The writes have been flushed to disk now and in recovery we will + find them in the doublewrite buffer blocks. Next, write the data pages. */ + for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++) + { + auto e= flush_slot->buf_block_arr[i]; + buf_page_t* bpage= e.request.bpage; + ut_ad(bpage->in_file()); + + /* We request frame here to get correct buffer in case of + encryption and/or page compression */ + void *frame= buf_page_get_frame(bpage); + + auto e_size= e.size; + + if (UNIV_LIKELY_NULL(bpage->zip.data)) + { + e_size= bpage->zip_size(); + ut_ad(e_size); + } + else + { + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE); + ut_ad(!bpage->zip_size()); + ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame))); + } + + const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + + static_cast<const byte*>(frame))); + ut_ad(lsn); + ut_ad(lsn >= bpage->oldest_modification()); + log_write_up_to(lsn, true); + e.request.node->space->io(e.request, bpage->physical_offset(), e_size, + frame, bpage); + } +} + +/** Flush possible buffered writes to persistent storage. +It is very important to call this function after a batch of writes has been +posted, and also when we may have to wait for a page latch! +Otherwise a deadlock of threads can occur. */ +void buf_dblwr_t::flush_buffered_writes() +{ + if (!is_initialised() || !srv_use_doublewrite_buf) + { + fil_flush_file_spaces(); + return; + } + + ut_ad(!srv_read_only_mode); + const ulint size= block_size(); + + mysql_mutex_lock(&mutex); + if (!flush_buffered_writes(size)) + mysql_mutex_unlock(&mutex); +} + +/** Schedule a page write. If the doublewrite memory buffer is full, +flush_buffered_writes() will be invoked to make space. 
+@param request asynchronous write request +@param size payload size in bytes */ +void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) +{ + ut_ad(request.is_async()); + ut_ad(request.is_write()); + ut_ad(request.bpage); + ut_ad(request.bpage->in_file()); + ut_ad(request.node); + ut_ad(request.node->space->id == request.bpage->id().space()); + ut_ad(request.node->space->referenced()); + ut_ad(!srv_read_only_mode); + + const ulint buf_size= 2 * block_size(); + + mysql_mutex_lock(&mutex); + + for (;;) + { + ut_ad(active_slot->first_free <= buf_size); + if (active_slot->first_free != buf_size) + break; + + if (flush_buffered_writes(buf_size / 2)) + mysql_mutex_lock(&mutex); + } + + byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free; + + /* We request frame here to get correct buffer in case of + encryption and/or page compression */ + void *frame= buf_page_get_frame(request.bpage); + + /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages, + and at least srv_page_size (4096-byte) for everything else. */ + memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size); + /* fil_page_compress() for page_compressed guarantees 256-byte alignment */ + memset_aligned<256>(p + size, 0, srv_page_size - size); + /* FIXME: Inform the compiler that "size" and "srv_page_size - size" + are integer multiples of 256, so the above can translate into simple + SIMD instructions. Currently, we make no such assumptions about the + non-pointer parameters that are passed to the _aligned templates. */ + ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size); + ut_ad(active_slot->reserved == active_slot->first_free); + ut_ad(active_slot->reserved < buf_size); + new (active_slot->buf_block_arr + active_slot->first_free++) + element{request, size}; + active_slot->reserved= active_slot->first_free; + + if (active_slot->first_free != buf_size || + !flush_buffered_writes(buf_size / 2)) + mysql_mutex_unlock(&mutex); +} diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc new file mode 100644 index 00000000..c6ddcb4f --- /dev/null +++ b/storage/innobase/buf/buf0dump.cc @@ -0,0 +1,824 @@ +/***************************************************************************** + +Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dump.cc +Implements a buffer pool dump/load. 
+ +Created April 08, 2011 Vasil Dimov +*******************************************************/ + +#include "my_global.h" +#include "mysqld.h" +#include "my_sys.h" + +#include "mysql/psi/mysql_stage.h" +#include "mysql/psi/psi.h" + +#include "buf0buf.h" +#include "buf0dump.h" +#include "dict0dict.h" +#include "os0file.h" +#include "os0thread.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "sync0rw.h" +#include "ut0byte.h" + +#include <algorithm> + +#include "mysql/service_wsrep.h" /* wsrep_recovery */ +#include <my_service_manager.h> + +static void buf_do_load_dump(); + +enum status_severity { + STATUS_INFO, + STATUS_ERR +}; + +#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE) + +/* Flags that tell the buffer pool dump/load thread which action should it +take after being waked up. */ +static volatile bool buf_dump_should_start; +static volatile bool buf_load_should_start; + +static bool buf_load_abort_flag; + +/** Start the buffer pool dump/load task and instructs it to start a dump. */ +void buf_dump_start() +{ + buf_dump_should_start= true; + buf_do_load_dump(); +} + +/** Start the buffer pool dump/load task and instructs it to start a load. */ +void buf_load_start() +{ + buf_load_should_start= true; + buf_do_load_dump(); +} + +/*****************************************************************//** +Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_dump_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according + to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_dump_status, + sizeof(export_vars.innodb_buffer_pool_dump_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_dump_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_dump_status; + break; + } + + va_end(ap); +} + +/*****************************************************************//** +Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_load_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) 
/*!< in: extra parameters according to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_load_status, + sizeof(export_vars.innodb_buffer_pool_load_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_load_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_load_status; + break; + } + + va_end(ap); +} + +/** Returns the directory path where the buffer pool dump file will be created. +@return directory path */ +static +const char* +get_buf_dump_dir() +{ + const char* dump_dir; + + /* The dump file should be created in the default data directory if + innodb_data_home_dir is set as an empty string. */ + if (!*srv_data_home) { + dump_dir = fil_path_to_mysql_datadir; + } else { + dump_dir = srv_data_home; + } + + return(dump_dir); +} + +/** Generate the path to the buffer pool dump/load file. +@param[out] path generated path +@param[in] path_size size of 'path', used as in snprintf(3). */ +static void buf_dump_generate_path(char *path, size_t path_size) +{ + char buf[FN_REFLEN]; + + mysql_mutex_lock(&LOCK_global_system_variables); + snprintf(buf, sizeof(buf), "%s%c%s", get_buf_dump_dir(), + OS_PATH_SEPARATOR, srv_buf_dump_filename); + mysql_mutex_unlock(&LOCK_global_system_variables); + + os_file_type_t type; + bool exists = false; + bool ret; + + ret = os_file_status(buf, &exists, &type); + + /* For realpath() to succeed the file must exist. */ + + if (ret && exists) { + /* my_realpath() assumes the destination buffer is big enough + to hold FN_REFLEN bytes. */ + ut_a(path_size >= FN_REFLEN); + + my_realpath(path, buf, 0); + } else { + /* If it does not exist, then resolve only srv_data_home + and append srv_buf_dump_filename to it. */ + char srv_data_home_full[FN_REFLEN]; + + my_realpath(srv_data_home_full, get_buf_dump_dir(), 0); + + if (srv_data_home_full[strlen(srv_data_home_full) - 1] + == OS_PATH_SEPARATOR) { + + snprintf(path, path_size, "%s%s", + srv_data_home_full, + srv_buf_dump_filename); + } else { + snprintf(path, path_size, "%s%c%s", + srv_data_home_full, + OS_PATH_SEPARATOR, + srv_buf_dump_filename); + } + } +} + +/*****************************************************************//** +Perform a buffer pool dump into the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status(). 
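+Only page identifiers are dumped, not page contents: each line of the
+dump file holds one space_id,page_no pair, for example "0,7" for page 7
+of the system tablespace.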
+The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_dump( +/*=====*/ + ibool obey_shutdown) /*!< in: quit if we are in a shutting down + state */ +{ +#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown) + + char full_filename[OS_FILE_MAX_PATH]; + char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"]; + char now[32]; + FILE* f; + int ret; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + snprintf(tmp_filename, sizeof(tmp_filename), + "%s.incomplete", full_filename); + + buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s", + full_filename); + +#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0 + f = fopen(tmp_filename, "w" STR_O_CLOEXEC); +#else + { + int fd; + fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640); + if (fd >= 0) { + f = fdopen(fd, "w"); + } + else { + f = NULL; + } + } +#endif + if (f == NULL) { + buf_dump_status(STATUS_ERR, + "Cannot open '%s' for writing: %s", + tmp_filename, strerror(errno)); + return; + } + const buf_page_t* bpage; + page_id_t* dump; + ulint n_pages; + ulint j; + + mysql_mutex_lock(&buf_pool.mutex); + + n_pages = UT_LIST_GET_LEN(buf_pool.LRU); + + /* skip empty buffer pools */ + if (n_pages == 0) { + mysql_mutex_unlock(&buf_pool.mutex); + goto done; + } + + if (srv_buf_pool_dump_pct != 100) { + ulint t_pages; + + /* limit the number of total pages dumped to X% of the + total number of pages */ + t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + if (n_pages > t_pages) { + buf_dump_status(STATUS_INFO, + "Restricted to " ULINTPF + " pages due to " + "innodb_buf_pool_dump_pct=%lu", + t_pages, srv_buf_pool_dump_pct); + n_pages = t_pages; + } + + if (n_pages == 0) { + n_pages = 1; + } + } + + dump = static_cast<page_id_t*>(ut_malloc_nokey( + n_pages * sizeof(*dump))); + + if (dump == NULL) { + mysql_mutex_unlock(&buf_pool.mutex); + fclose(f); + buf_dump_status(STATUS_ERR, + "Cannot allocate " ULINTPF " bytes: %s", + (ulint) (n_pages * sizeof(*dump)), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0; + bpage != NULL && j < n_pages; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + ut_a(bpage->in_file()); + const page_id_t id(bpage->id()); + + if (id.space() == SRV_TMP_SPACE_ID) { + /* Ignore the innodb_temporary tablespace. 
*/ + continue; + } + + if (bpage->status == buf_page_t::FREED) { + continue; + } + + dump[j++] = id; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + ut_a(j <= n_pages); + n_pages = j; + + for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) { + ret = fprintf(f, "%u,%u\n", + dump[j].space(), dump[j].page_no()); + if (ret < 0) { + ut_free(dump); + fclose(f); + buf_dump_status(STATUS_ERR, + "Cannot write to '%s': %s", + tmp_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + if (SHUTTING_DOWN() && !(j & 1023)) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "Dumping buffer pool page " + ULINTPF "/" ULINTPF, j + 1, n_pages); + } + } + + ut_free(dump); + +done: + ret = fclose(f); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot close '%s': %s", + tmp_filename, strerror(errno)); + return; + } + /* else */ + + ret = unlink(full_filename); + if (ret != 0 && errno != ENOENT) { + buf_dump_status(STATUS_ERR, + "Cannot delete '%s': %s", + full_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + ret = rename(tmp_filename, full_filename); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot rename '%s' to '%s': %s", + tmp_filename, full_filename, + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + /* success */ + + ut_sprintf_timestamp(now); + + buf_dump_status(STATUS_INFO, + "Buffer pool(s) dump completed at %s", now); + + /* Though dumping doesn't related to an incomplete load, + we reset this to 0 here to indicate that a shutdown can also perform + a dump */ + export_vars.innodb_buffer_pool_load_incomplete = 0; +} + +/*****************************************************************//** +Artificially delay the buffer pool loading if necessary. The idea of +this function is to prevent hogging the server with IO and slowing down +too much normal client queries. */ +UNIV_INLINE +void +buf_load_throttle_if_needed( +/*========================*/ + ulint* last_check_time, /*!< in/out: milliseconds since epoch + of the last time we did check if + throttling is needed, we do the check + every srv_io_capacity IO ops. */ + ulint* last_activity_count, + ulint n_io) /*!< in: number of IO ops done since + buffer pool load has started */ +{ + if (n_io % srv_io_capacity < srv_io_capacity - 1) { + return; + } + + if (*last_check_time == 0 || *last_activity_count == 0) { + *last_check_time = ut_time_ms(); + *last_activity_count = srv_get_activity_count(); + return; + } + + /* srv_io_capacity IO operations have been performed by buffer pool + load since the last time we were here. */ + + /* If no other activity, then keep going without any delay. */ + if (srv_get_activity_count() == *last_activity_count) { + return; + } + + /* There has been other activity, throttle. */ + + ulint now = ut_time_ms(); + ulint elapsed_time = now - *last_check_time; + + /* Notice that elapsed_time is not the time for the last + srv_io_capacity IO operations performed by BP load. It is the + time elapsed since the last time we detected that there has been + other activity. This has a small and acceptable deficiency, e.g.: + 1. BP load runs and there is no other activity. + 2. Other activity occurs, we run N IO operations after that and + enter here (where 0 <= N < srv_io_capacity). + 3. last_check_time is very old and we do not sleep at this time, but + only update last_check_time and last_activity_count. + 4. We run srv_io_capacity more IO operations and call this function + again. + 5. 
There has been more other activity and thus we enter here. + 6. Now last_check_time is recent and we sleep if necessary to prevent + more than srv_io_capacity IO operations per second. + The deficiency is that we could have slept at 3., but for this we + would have to update last_check_time before the + "cur_activity_count == *last_activity_count" check and calling + ut_time_ms() that often may turn out to be too expensive. */ + + if (elapsed_time < 1000 /* 1 sec (1000 milli secs) */) { + os_thread_sleep((1000 - elapsed_time) * 1000 /* micro secs */); + } + + *last_check_time = ut_time_ms(); + *last_activity_count = srv_get_activity_count(); +} + +/*****************************************************************//** +Perform a buffer pool load from the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_load_status will be set accordingly, see buf_load_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_load() +/*======*/ +{ + char full_filename[OS_FILE_MAX_PATH]; + char now[32]; + FILE* f; + page_id_t* dump; + ulint dump_n; + ulint i; + uint32_t space_id; + uint32_t page_no; + int fscanf_ret; + + /* Ignore any leftovers from before */ + buf_load_abort_flag = false; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + buf_load_status(STATUS_INFO, + "Loading buffer pool(s) from %s", full_filename); + + f = fopen(full_filename, "r" STR_O_CLOEXEC); + if (f == NULL) { + buf_load_status(STATUS_INFO, + "Cannot open '%s' for reading: %s", + full_filename, strerror(errno)); + return; + } + /* else */ + + /* First scan the file to estimate how many entries are in it. + This file is tiny (approx 500KB per 1GB buffer pool), reading it + two times is fine. */ + dump_n = 0; + while (fscanf(f, "%u,%u", &space_id, &page_no) == 2 + && !SHUTTING_DOWN()) { + dump_n++; + } + + if (!SHUTTING_DOWN() && !feof(f)) { + /* fscanf() returned != 2 */ + const char* what; + if (ferror(f)) { + what = "reading"; + } else { + what = "parsing"; + } + fclose(f); + buf_load_status(STATUS_ERR, "Error %s '%s'," + " unable to load buffer pool (stage 1)", + what, full_filename); + return; + } + + /* If dump is larger than the buffer pool(s), then we ignore the + extra trailing. This could happen if a dump is made, then buffer + pool is shrunk and then load is attempted. 
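+	For example, if the dump file lists 100000 pages but the buffer
+	pool can now hold only 60000 pages, only the first 60000 entries
+	are used and the rest are ignored.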
*/ + dump_n = std::min(dump_n, buf_pool.get_n_pages()); + + if (dump_n != 0) { + dump = static_cast<page_id_t*>(ut_malloc_nokey( + dump_n * sizeof(*dump))); + } else { + fclose(f); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty)", now, full_filename); + return; + } + + if (dump == NULL) { + fclose(f); + buf_load_status(STATUS_ERR, + "Cannot allocate " ULINTPF " bytes: %s", + dump_n * sizeof(*dump), + strerror(errno)); + return; + } + + rewind(f); + + export_vars.innodb_buffer_pool_load_incomplete = 1; + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no); + + if (fscanf_ret != 2) { + if (feof(f)) { + break; + } + /* else */ + + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s', unable" + " to load buffer pool (stage 2)", + full_filename); + return; + } + + if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) { + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s': bogus" + " space,page %u,%u at line " ULINTPF + ", unable to load buffer pool", + full_filename, + space_id, page_no, + i); + return; + } + + dump[i] = page_id_t(space_id, page_no); + } + + /* Set dump_n to the actual number of initialized elements, + i could be smaller than dump_n here if the file got truncated after + we read it the first time. */ + dump_n = i; + + fclose(f); + + if (dump_n == 0) { + ut_free(dump); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty or had errors)", now, full_filename); + return; + } + + if (!SHUTTING_DOWN()) { + std::sort(dump, dump + dump_n); + } + + ulint last_check_time = 0; + ulint last_activity_cnt = 0; + + /* Avoid calling the expensive fil_space_t::get() for each + page within the same tablespace. dump[] is sorted by (space, page), + so all pages from a given tablespace are consecutive. */ + ulint cur_space_id = dump[0].space(); + fil_space_t* space = fil_space_t::get(cur_space_id); + ulint zip_size = space ? space->zip_size() : 0; + + PSI_stage_progress* pfs_stage_progress __attribute__((unused)) + = mysql_set_stage(srv_stage_buffer_pool_load.m_key); + mysql_stage_set_work_estimated(pfs_stage_progress, dump_n); + mysql_stage_set_work_completed(pfs_stage_progress, 0); + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + + /* space_id for this iteration of the loop */ + const ulint this_space_id = dump[i].space(); + + if (this_space_id == SRV_TMP_SPACE_ID) { + /* Ignore the innodb_temporary tablespace. */ + continue; + } + + if (this_space_id != cur_space_id) { + if (space) { + space->release(); + } + + cur_space_id = this_space_id; + space = fil_space_t::get(cur_space_id); + + if (!space) { + continue; + } + + zip_size = space->zip_size(); + } + + /* JAN: TODO: As we use background page read below, + if tablespace is encrypted we cant use it. 
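+	In other words, pages that lie beyond the current end of the
+	tablespace, or whose tablespace is encrypted or no longer exists,
+	are simply skipped instead of being read in the background.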
*/ + if (!space || dump[i].page_no() >= space->get_size() || + (space->crypt_data && + space->crypt_data->encryption != FIL_ENCRYPTION_OFF && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { + continue; + } + + if (space->is_stopping()) { + space->release(); + space = nullptr; + continue; + } + + space->reacquire(); + buf_read_page_background(space, dump[i], zip_size, true); + + if (buf_load_abort_flag) { + if (space) { + space->release(); + } + buf_load_abort_flag = false; + ut_free(dump); + buf_load_status( + STATUS_INFO, + "Buffer pool(s) load aborted on request"); + /* Premature end, set estimated = completed = i and + end the current stage event. */ + + mysql_stage_set_work_estimated(pfs_stage_progress, i); + mysql_stage_set_work_completed(pfs_stage_progress, i); + + mysql_end_stage(); + return; + } + + buf_load_throttle_if_needed( + &last_check_time, &last_activity_cnt, i); + +#ifdef UNIV_DEBUG + if ((i+1) >= srv_buf_pool_load_pages_abort) { + buf_load_abort_flag = true; + } +#endif + } + + if (space) { + space->release(); + } + + ut_free(dump); + + ut_sprintf_timestamp(now); + + if (i == dump_n) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s", now); + export_vars.innodb_buffer_pool_load_incomplete = 0; + } else if (!buf_load_abort_flag) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to user instigated abort at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we don't want a shutdown to save the buffer pool */ + } else { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to shutdown at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we want to abort without saving the buffer pool */ + } + + /* Make sure that estimated = completed when we end. */ + mysql_stage_set_work_completed(pfs_stage_progress, dump_n); + /* End the stage progress event. */ + mysql_end_stage(); +} + +/** Abort a currently running buffer pool load. */ +void buf_load_abort() +{ + buf_load_abort_flag= true; +} + +/*****************************************************************//** +This is the main task for buffer pool dump/load. 
when scheduled +either performs a dump or load, depending on server state, state of the variables etc- */ +static void buf_dump_load_func(void *) +{ + ut_ad(!srv_read_only_mode); + static bool first_time = true; + if (first_time && srv_buffer_pool_load_at_startup) { + +#ifdef WITH_WSREP + if (!get_wsrep_recovery()) { +#endif /* WITH_WSREP */ + buf_load(); +#ifdef WITH_WSREP + } +#endif /* WITH_WSREP */ + } + first_time = false; + + while (!SHUTTING_DOWN()) { + if (buf_dump_should_start) { + buf_dump_should_start = false; + buf_dump(true); + } + if (buf_load_should_start) { + buf_load_should_start = false; + buf_load(); + } + + if (!buf_dump_should_start && !buf_load_should_start) { + return; + } + } + + /* In shutdown */ + if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) { + if (export_vars.innodb_buffer_pool_load_incomplete) { + buf_dump_status(STATUS_INFO, + "Dumping of buffer pool not started" + " as load was incomplete"); +#ifdef WITH_WSREP + } else if (get_wsrep_recovery()) { +#endif /* WITH_WSREP */ + } else { + buf_dump(false/* do complete dump at shutdown */); + } + } +} + + +/* Execute task with max.concurrency */ +static tpool::task_group tpool_group(1); +static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group); +static bool load_dump_enabled; + +/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/ +void buf_load_at_startup() +{ + load_dump_enabled= true; + if (srv_buffer_pool_load_at_startup) + buf_do_load_dump(); +} + +static void buf_do_load_dump() +{ + if (load_dump_enabled && !buf_dump_load_task.is_running()) + srv_thread_pool->submit_task(&buf_dump_load_task); +} + +/** Wait for currently running load/dumps to finish*/ +void buf_load_dump_end() +{ + ut_ad(SHUTTING_DOWN()); + buf_dump_load_task.wait(); +} diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc new file mode 100644 index 00000000..10a84d99 --- /dev/null +++ b/storage/innobase/buf/buf0flu.cc @@ -0,0 +1,2530 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2021, MariaDB Corporation. +Copyright (c) 2013, 2014, Fusion-io + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0flu.cc +The database buffer buf_pool flush algorithm + +Created 11/11/1995 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include <my_service_manager.h> +#include <mysql/service_thd_wait.h> +#include <sql_class.h> + +#include "buf0flu.h" +#include "buf0buf.h" +#include "buf0checksum.h" +#include "buf0dblwr.h" +#include "srv0start.h" +#include "page0zip.h" +#include "fil0fil.h" +#include "log0crypt.h" +#include "srv0mon.h" +#include "fil0pagecompress.h" +#ifdef HAVE_LZO +# include "lzo/lzo1x.h" +#elif defined HAVE_SNAPPY +# include "snappy-c.h" +#endif + +/** Number of pages flushed via LRU. Protected by buf_pool.mutex. +Also included in buf_flush_page_count. */ +ulint buf_lru_flush_page_count; + +/** Number of pages flushed. Protected by buf_pool.mutex. */ +ulint buf_flush_page_count; + +/** Flag indicating if the page_cleaner is in active state. */ +bool buf_page_cleaner_is_active; + +/** Factor for scan length to determine n_pages for intended oldest LSN +progress */ +static constexpr ulint buf_flush_lsn_scan_factor = 3; + +/** Average redo generation rate */ +static lsn_t lsn_avg_rate = 0; + +/** Target oldest_modification for the page cleaner background flushing; +writes are protected by buf_pool.flush_list_mutex */ +static Atomic_relaxed<lsn_t> buf_flush_async_lsn; +/** Target oldest_modification for the page cleaner furious flushing; +writes are protected by buf_pool.flush_list_mutex */ +static Atomic_relaxed<lsn_t> buf_flush_sync_lsn; + +#ifdef UNIV_PFS_THREAD +mysql_pfs_key_t page_cleaner_thread_key; +#endif /* UNIV_PFS_THREAD */ + +/** Page cleaner structure */ +static struct +{ + /** total elapsed time in adaptive flushing, in seconds */ + ulint flush_time; + /** number of adaptive flushing passes */ + ulint flush_pass; +} page_cleaner; + +#ifdef UNIV_DEBUG +my_bool innodb_page_cleaner_disabled_debug; +#endif /* UNIV_DEBUG */ + +/** If LRU list of a buf_pool is less than this size then LRU eviction +should not happen. This is because when we do LRU flushing we also put +the blocks on free list. If LRU list is very small then we can end up +in thrashing. */ +#define BUF_LRU_MIN_LEN 256 + +/* @} */ + +#ifdef UNIV_DEBUG +/** Validate the flush list. */ +static void buf_flush_validate_low(); + +/** Validates the flush list some of the time. */ +static void buf_flush_validate_skip() +{ +/** Try buf_flush_validate_low() every this many times */ +# define BUF_FLUSH_VALIDATE_SKIP 23 + + /** The buf_flush_validate_low() call skip counter. + Use a signed type because of the race condition below. */ + static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly buf_flush_validate_low() + check in debug builds. 
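+	(If the counter were unsigned, a racing extra decrement could
+	wrap it around to a huge value and postpone the validation for
+	billions of calls; with a signed counter the value merely goes
+	negative and the check below still falls through to the reset
+	and the validation.)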
*/ + if (--buf_flush_validate_count > 0) { + return; + } + + buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; + buf_flush_validate_low(); +} +#endif /* UNIV_DEBUG */ + +/** Wake up the page cleaner if needed */ +inline void buf_pool_t::page_cleaner_wakeup() +{ + if (!page_cleaner_idle()) + return; + double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + double pct_lwm= srv_max_dirty_pages_pct_lwm; + + /* if pct_lwm != 0.0, adaptive flushing is enabled. + signal buf page cleaner thread + - if pct_lwm <= dirty_pct then it will invoke apdative flushing flow + - if pct_lwm > dirty_pct then it will invoke idle flushing flow. + + idle_flushing: + dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an + idle flushing use-case. + + Why is last_activity_count not updated always? + - let's first understand when is server activity count updated. + - it is updated on commit of a transaction trx_t::commit() and not + on adding a page to the flush list. + - page_cleaner_wakeup is called when a page is added to the flush list. + + - now let's say the first user thread, updates the count from X -> Y but + is yet to commit the transaction (so activity count is still Y). + followup user threads will see the updated count as (Y) that is matching + the universal server activity count (Y), giving a false impression that + the server is idle. + + How to avoid this? + - by allowing last_activity_count to updated when page-cleaner is made + active and has work to do. This ensures that the last_activity signal + is consumed by the page-cleaner before the next one is generated. */ + if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) || + (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) || + srv_max_buf_pool_modified_pct <= dirty_pct) + { + page_cleaner_is_idle= false; + pthread_cond_signal(&do_flush_list); + } +} + +inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage) +{ + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + mysql_mutex_assert_owner(&flush_list_mutex); + flush_hp.adjust(bpage); + UT_LIST_REMOVE(flush_list, bpage); +} + +/** Insert a modified block into the flush list. +@param block modified block +@param lsn start LSN of the mini-transaction that modified the block */ +void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) +{ + mysql_mutex_assert_not_owner(&mutex); + mysql_mutex_assert_owner(&log_sys.flush_order_mutex); + ut_ad(lsn > 2); + ut_ad(!fsp_is_system_temporary(block->page.id().space())); + + mysql_mutex_lock(&flush_list_mutex); + if (ut_d(const lsn_t old=) block->page.oldest_modification()) + { + ut_ad(old == 1); + delete_from_flush_list_low(&block->page); + } + else + stat.flush_list_bytes+= block->physical_size(); + ut_ad(stat.flush_list_bytes <= curr_pool_size); + + block->page.set_oldest_modification(lsn); + MEM_CHECK_DEFINED(block->page.zip.data + ? block->page.zip.data : block->frame, + block->physical_size()); + UT_LIST_ADD_FIRST(flush_list, &block->page); + ut_d(buf_flush_validate_skip()); + page_cleaner_wakeup(); + mysql_mutex_unlock(&flush_list_mutex); +} + +/** Remove a block from flush_list. 
+@param bpage buffer pool page +@param clear whether to invoke buf_page_t::clear_oldest_modification() */ +void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear) +{ + delete_from_flush_list_low(bpage); + stat.flush_list_bytes-= bpage->physical_size(); + if (clear) + bpage->clear_oldest_modification(); +#ifdef UNIV_DEBUG + buf_flush_validate_skip(); +#endif /* UNIV_DEBUG */ +} + +/** Remove all dirty pages belonging to a given tablespace when we are +deleting the data file of that tablespace. +The pages still remain a part of LRU and are evicted from +the list as they age towards the tail of the LRU. +@param id tablespace identifier */ +void buf_flush_remove_pages(ulint id) +{ + const page_id_t first(id, 0), end(id + 1, 0); + ut_ad(id); + mysql_mutex_lock(&buf_pool.mutex); + + for (;;) + { + bool deferred= false; + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) + { + ut_d(const auto s= bpage->state()); + ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE || + s == BUF_BLOCK_REMOVE_HASH); + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + const page_id_t bpage_id(bpage->id()); + + if (bpage_id < first || bpage_id >= end); + else if (bpage->io_fix() != BUF_IO_NONE) + deferred= true; + else + buf_pool.delete_from_flush_list(bpage); + + bpage= prev; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (!deferred) + break; + + mysql_mutex_unlock(&buf_pool.mutex); + os_thread_yield(); + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_wait_batch_end(false); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/*******************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage have already been +copied to dpage. +IMPORTANT: When this function is called bpage and dpage are not +exact copies of each other. For example, they both will have different +::state. Also the ::list pointers in dpage may be stale. We need to +use the current list node (bpage) to do the list manipulation because +the list pointers could have changed between the time that we copied +the contents of bpage to the dpage and the flush list manipulation +below. */ +ATTRIBUTE_COLD +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + + const lsn_t lsn = bpage->oldest_modification(); + + if (!lsn) { + return; + } + + ut_ad(lsn == 1 || lsn > 2); + ut_ad(dpage->oldest_modification() == lsn); + + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool.flush_hp.adjust(bpage); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(buf_pool.flush_list, bpage); + + bpage->clear_oldest_modification(); + + if (lsn == 1) { + buf_pool.stat.flush_list_bytes -= dpage->physical_size(); + dpage->list.prev = nullptr; + dpage->list.next = nullptr; + dpage->clear_oldest_modification(); + } else if (prev) { + ut_ad(prev->oldest_modification()); + UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage); + } else { + UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); + } + + ut_d(buf_flush_validate_low()); +} + +/** Complete write of a file page from buf_pool. 
+@param request write request */ +void buf_page_write_complete(const IORequest &request) +{ + ut_ad(request.is_write()); + ut_ad(!srv_read_only_mode/* || + request.node->space->purpose == FIL_TYPE_TEMPORARY*/); + buf_page_t *bpage= request.bpage; + ut_ad(bpage); + ut_ad(bpage->in_file()); + /* bpage->io_fix() can only be changed by buf_page_write_complete() + and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */ + ut_ad(bpage->io_fix() == BUF_IO_WRITE); + ut_ad(!buf_dblwr.is_inside(bpage->id())); + ut_ad(request.node->space->id == bpage->id().space()); + + if (bpage->status == buf_page_t::INIT_ON_FLUSH) + bpage->status= buf_page_t::NORMAL; + else + { + ut_ad(bpage->status == buf_page_t::NORMAL); + if (request.node->space->use_doublewrite()) + { + ut_ad(request.node->space != fil_system.temp_space); + buf_dblwr.write_completed(); + } + } + + if (bpage->slot) + { + bpage->slot->release(); + bpage->slot= nullptr; + } + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(bpage, BUF_IO_WRITE); + DBUG_PRINT("ib_buf", ("write page %u:%u", + bpage->id().space(), bpage->id().page_no())); + const bool temp= fsp_is_system_temporary(bpage->id().space()); + + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written++; + /* While we do not need any mutex for clearing oldest_modification + here, we hope that it will be in the same cache line with io_fix, + whose changes must be protected by buf_pool.mutex. */ + bpage->clear_oldest_modification(temp); + ut_ad(bpage->io_fix() == BUF_IO_WRITE); + bpage->set_io_fix(BUF_IO_NONE); + + /* Because this thread which does the unlocking might not be the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread id. */ + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE); + + if (request.is_LRU()) + { + buf_LRU_free_page(bpage, true); + + ut_ad(buf_pool.n_flush_LRU_); + if (!--buf_pool.n_flush_LRU_) + { + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + pthread_cond_signal(&buf_pool.done_free); + } + } + else + { + ut_ad(!temp); + ut_ad(buf_pool.n_flush_list_); + if (!--buf_pool.n_flush_list_) + pthread_cond_broadcast(&buf_pool.done_flush_list); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. +@param[in,out] page page to update +@param[in] size compressed page size */ +void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size) +{ + ut_ad(size > 0); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + page_zip_calc_checksum(page, size, + static_cast<srv_checksum_algorithm_t> + (srv_checksum_algorithm))); +} + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page) +{ + ut_d(bool compressed = false); + ut_d(bool corrupted = false); + ut_d(const uint size = buf_page_full_crc32_size(page, &compressed, + &corrupted)); + ut_ad(!compressed); + ut_ad(!corrupted); + ut_ad(size == uint(srv_page_size)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(page + payload, ut_crc32(page, payload)); +} + +/** Initialize a page for writing to the tablespace. 
+@param[in] block buffer block; NULL if bypassing + the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if + uncompressed +@param[in] use_full_checksum whether tablespace uses full checksum */ +void +buf_flush_init_for_writing( + const buf_block_t* block, + byte* page, + void* page_zip_, + bool use_full_checksum) +{ + if (block != NULL && block->frame != page) { + /* If page is encrypted in full crc32 format then + checksum stored already as a part of fil_encrypt_buf() */ + ut_ad(use_full_checksum); + return; + } + + ut_ad(block == NULL || block->frame == page); + ut_ad(block == NULL || page_zip_ == NULL + || &block->page.zip == page_zip_); + ut_ad(page); + + if (page_zip_) { + page_zip_des_t* page_zip; + ulint size; + + page_zip = static_cast<page_zip_des_t*>(page_zip_); + size = page_zip_get_size(page_zip); + + ut_ad(size); + ut_ad(ut_is_2pow(size)); + ut_ad(size <= UNIV_ZIP_SIZE_MAX); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + memcpy(page_zip->data, page, size); + /* fall through */ + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + buf_flush_update_zip_checksum(page_zip->data, size); + return; + } + + ib::error() << "The compressed page to be written" + " seems corrupt:"; + ut_print_buf(stderr, page, size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, size); + putc('\n', stderr); + ut_error; + } + + if (use_full_checksum) { + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 4 == 0, "aligned"); + memcpy_aligned<4>(page + srv_page_size + - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + page, 4); + return buf_flush_assign_full_crc32_checksum(page); + } + + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 8 == 0, "aligned"); + memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + FIL_PAGE_LSN + page, 8); + + if (block && srv_page_size == 16384) { + /* The page type could be garbage in old files + created before MySQL 5.5. Such files always + had a page size of 16 kilobytes. */ + ulint page_type = fil_page_get_type(page); + ulint reset_type = page_type; + + switch (block->page.id().page_no() % 16384) { + case 0: + reset_type = block->page.id().page_no() == 0 + ? FIL_PAGE_TYPE_FSP_HDR + : FIL_PAGE_TYPE_XDES; + break; + case 1: + reset_type = FIL_PAGE_IBUF_BITMAP; + break; + case FSP_TRX_SYS_PAGE_NO: + if (block->page.id() + == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) { + reset_type = FIL_PAGE_TYPE_TRX_SYS; + break; + } + /* fall through */ + default: + switch (page_type) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + case FIL_PAGE_UNDO_LOG: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_TYPE_SYS: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_IBUF_BITMAP: + /* These pages should have + predetermined page numbers + (see above). 
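+			If one of these types shows up at any other
+			page number, it cannot be valid, so it is
+			reset to FIL_PAGE_TYPE_UNKNOWN in the default
+			branch below.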
*/ + default: + reset_type = FIL_PAGE_TYPE_UNKNOWN; + break; + } + } + + if (UNIV_UNLIKELY(page_type != reset_type)) { + ib::info() + << "Resetting invalid page " + << block->page.id() << " type " + << page_type << " to " + << reset_type << " when flushing."; + fil_page_set_type(page, reset_type); + } + } + + uint32_t checksum = BUF_NO_CHECKSUM_MAGIC; + + switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) { + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + checksum = buf_calc_page_new_checksum(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + checksum); + /* With the InnoDB checksum, we overwrite the first 4 bytes of + the end lsn field to store the old formula checksum. Since it + depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to + be calculated after storing the new formula checksum. */ + checksum = buf_calc_page_old_checksum(page); + break; + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + /* In other cases we write the same checksum to both fields. */ + checksum = buf_calc_page_crc32(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + checksum); + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + checksum); + break; + /* no default so the compiler will emit a warning if + new enum is added and not handled here */ + } + + mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + checksum); +} + +/** Reserve a buffer for compression. +@param[in,out] slot reserved slot */ +static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot) +{ + if (slot->comp_buf) + return; + /* Both Snappy and LZO compression methods require that the output + buffer be bigger than input buffer. Adjust the allocated size. */ + ulint size= srv_page_size; +#ifdef HAVE_LZO + size+= LZO1X_1_15_MEM_COMPRESS; +#elif defined HAVE_SNAPPY + size= snappy_max_compressed_length(size); +#endif + slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size)); +} + +/** Encrypt a buffer of temporary tablespace +@param[in] offset Page offset +@param[in] s Page to encrypt +@param[in,out] d Output buffer +@return encrypted buffer or NULL */ +static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d) +{ + /* Calculate the start offset in a page */ + uint srclen= static_cast<uint>(srv_page_size) - + (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true)) + return NULL; + + const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(d + payload, ut_crc32(d, payload)); + + srv_stats.pages_encrypted.inc(); + srv_stats.n_temp_blocks_encrypted.inc(); + return d; +} + +/** Encryption and page_compression hook that is called just before +a page is written to disk. 
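+Page 0 of any tablespace and the TRX_SYS page are returned unchanged;
+any other page may be encrypted, page_compressed, or both, depending on
+the tablespace flags (for the temporary tablespace only encryption
+applies, controlled by innodb_encrypt_temporary_tables).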
+@param[in,out] space tablespace +@param[in,out] bpage buffer page +@param[in] s physical page frame that is being encrypted +@param[in,out] size payload size in bytes +@return page frame to be written to file +(may be src_frame or an encrypted/compressed copy of it) */ +static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, + size_t *size) +{ + ut_ad(bpage->status != buf_page_t::FREED); + ut_ad(space->id == bpage->id().space()); + + ut_d(fil_page_type_validate(space, s)); + const uint32_t page_no= bpage->id().page_no(); + + switch (page_no) { + case TRX_SYS_PAGE_NO: + if (bpage->id().space() != TRX_SYS_SPACE) + break; + /* The TRX_SYS page is neither encrypted nor compressed, because + it contains the address of the doublewrite buffer. */ + /* fall through */ + case 0: + /* Page 0 of a tablespace is not encrypted/compressed */ + return s; + } + + fil_space_crypt_t *crypt_data= space->crypt_data; + bool encrypted, page_compressed; + if (space->purpose == FIL_TYPE_TEMPORARY) + { + ut_ad(!crypt_data); + encrypted= innodb_encrypt_temporary_tables; + page_compressed= false; + } + else + { + encrypted= crypt_data && !crypt_data->not_encrypted() && + crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && + (!crypt_data->is_default_encryption() || srv_encrypt_tables); + page_compressed= space->is_compressed(); + } + + const bool full_crc32= space->full_crc32(); + + if (!encrypted && !page_compressed) + { + /* No need to encrypt or compress. Clear key-version & crypt-checksum. */ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2, + "not perfect alignment"); + if (full_crc32) + memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4); + else + memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + return s; + } + + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + if (full_crc32) + memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + s, 4); + + ut_ad(!bpage->zip_size() || !page_compressed); + /* Find free slot from temporary memory array */ + buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve(); + ut_a(slot); + slot->allocate(); + slot->out_buf= NULL; + bpage->slot= slot; + + byte *d= slot->crypt_buf; + + if (!page_compressed) + { +not_compressed: + byte *tmp= space->purpose == FIL_TYPE_TEMPORARY + ? buf_tmp_page_encrypt(page_no, s, d) + : fil_space_encrypt(space, page_no, s, d); + + slot->out_buf= d= tmp; + + ut_d(fil_page_type_validate(space, tmp)); + } + else + { + ut_ad(space->purpose != FIL_TYPE_TEMPORARY); + /* First we compress the page content */ + buf_tmp_reserve_compression_buf(slot); + byte *tmp= slot->comp_buf; + ulint len= fil_page_compress(s, tmp, space->flags, + fil_space_get_block_size(space, page_no), + encrypted); + + if (!len) + goto not_compressed; + + *size= len; + + if (full_crc32) + { + ut_d(bool compressed = false); + len= buf_page_full_crc32_size(tmp, +#ifdef UNIV_DEBUG + &compressed, +#else + NULL, +#endif + NULL); + ut_ad(compressed); + } + + /* Workaround for MDEV-15527. 
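+ The remainder of the page frame beyond the compressed payload is
+ zeroed out below, so that no stale bytes are written to the data file.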
*/ + memset(tmp + len, 0 , srv_page_size - len); + ut_d(fil_page_type_validate(space, tmp)); + + if (encrypted) + tmp = fil_space_encrypt(space, page_no, tmp, d); + + if (full_crc32) + { + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4)); + ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); + } + + slot->out_buf= d= tmp; + } + + ut_d(fil_page_type_validate(space, d)); + return d; +} + +/** Free a page whose underlying file page has been freed. */ +inline void buf_pool_t::release_freed_page(buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE; + mysql_mutex_lock(&mutex); + bpage->set_io_fix(BUF_IO_NONE); + bpage->status= buf_page_t::NORMAL; + mysql_mutex_lock(&flush_list_mutex); + ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) + if (fsp_is_system_temporary(bpage->id().space())) + { + ut_ad(uncompressed); + ut_ad(oldest_modification == 2); + } + else + { + ut_ad(oldest_modification > 2); + delete_from_flush_list(bpage, false); + } + bpage->clear_oldest_modification(); + mysql_mutex_unlock(&flush_list_mutex); + + if (uncompressed) + rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock, + BUF_IO_WRITE); + + buf_LRU_free_page(bpage, true); + mysql_mutex_unlock(&mutex); +} + +/** Write a flushable page from buf_pool to a file. +buf_pool.mutex must be held. +@param bpage buffer control block +@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param space tablespace +@return whether the page was flushed and buf_pool.mutex was released */ +static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) +{ + ut_ad(bpage->in_file()); + ut_ad(bpage->ready_for_flush()); + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); + ut_ad(space->purpose == FIL_TYPE_TABLESPACE || + space->atomic_write_supported); + ut_ad(space->referenced()); + ut_ad(lru || space != fil_system.temp_space); + + rw_lock_t *rw_lock; + + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + rw_lock= nullptr; + else + { + rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock; + if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) + return false; + } + + bpage->set_io_fix(BUF_IO_WRITE); + /* Because bpage->status can only be changed while buf_block_t + exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages + without first allocating the uncompressed page frame. Such + allocation cannot be completed due to our io_fix. So, bpage->status + is protected even if !rw_lock. */ + const auto status= bpage->status; + + if (status != buf_page_t::FREED) + { + if (lru) + buf_pool.n_flush_LRU_++; + else + buf_pool.n_flush_list_++; + buf_flush_page_count++; + } + + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + + /* We are holding rw_lock = buf_block_t::lock in SX mode except if + this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame + has been evicted from the buffer pool. + + Apart from possible rw_lock protection, bpage is also protected by + io_fix and oldest_modification()!=0. Thus, it cannot be relocated in + the buffer pool or removed from flush_list or LRU_list. */ + + DBUG_PRINT("ib_buf", ("%s %u page %u:%u", + lru ? "LRU" : "flush_list", + bpage->id().space(), bpage->id().page_no())); + ut_ad(bpage->io_fix() == BUF_IO_WRITE); + ut_d(const lsn_t oldest_modification= bpage->oldest_modification()); + ut_ad(space == fil_system.temp_space + ? 
oldest_modification == 2 + : oldest_modification > 2); + ut_ad(bpage->state() == + (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE)); + ut_ad(ULINT_UNDEFINED > + (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_)); + mysql_mutex_unlock(&buf_pool.mutex); + + buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage); + page_t *frame= bpage->zip.data; + + if (status == buf_page_t::FREED) + buf_pool.release_freed_page(&block->page); + else + { + space->reacquire(); + ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH); + size_t size; +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + size_t orig_size; +#endif + IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; + + if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */ + { + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + size= bpage->zip_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + buf_flush_update_zip_checksum(frame, size); + frame= buf_page_encrypt(space, bpage, frame, &size); + ut_ad(size == bpage->zip_size()); + } + else + { + byte *page= block->frame; + size= block->physical_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + + if (space->full_crc32()) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!frame); + page= buf_page_encrypt(space, bpage, page, &size); + buf_flush_init_for_writing(block, page, nullptr, true); + } + else + { + buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr, + false); + page= buf_page_encrypt(space, bpage, frame ? frame : page, &size); + } + +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + if (size != orig_size && space->punch_hole) + type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH; +#endif + frame=page; + } + + ut_ad(status == bpage->status); + ut_ad(oldest_modification == bpage->oldest_modification()); + + if (status != buf_page_t::NORMAL || !space->use_doublewrite()) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (frame ? frame + : block->frame))); + ut_ad(lsn >= oldest_modification); + if (lsn > log_sys.get_flushed_lsn()) + log_write_up_to(lsn, true); + } + space->io(IORequest(type, bpage), + bpage->physical_offset(), size, frame, bpage); + } + else + buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size); + } + + /* Increment the I/O operation count used for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + return true; +} + +/** Check whether a page can be flushed from the buf_pool. +@param id page identifier +@param fold id.fold() +@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@return whether the page can be flushed */ +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(fold == id.fold()); + + buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold); + + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + return false; + + /* We avoid flushing 'non-old' blocks in an LRU flush, because the + flushed blocks are soon freed */ + if (lru && !bpage->is_old()) + return false; + + return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); +} + +/** Check which neighbors of a page can be flushed from the buf_pool. 
+@param space tablespace +@param id page identifier of a dirty page +@param contiguous whether to consider contiguous areas of pages +@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@return last page number that can be flushed */ +static page_id_t buf_flush_check_neighbors(const fil_space_t &space, + page_id_t &id, bool contiguous, + bool lru) +{ + ut_ad(id.page_no() < space.size); + /* When flushed, dirty blocks are searched in neighborhoods of this + size, and flushed along with the original page. */ + const ulint s= buf_pool.curr_size / 16; + const uint32_t read_ahead= buf_pool.read_ahead_area; + const uint32_t buf_flush_area= read_ahead > s + ? static_cast<uint32_t>(s) : read_ahead; + page_id_t low= id - (id.page_no() % buf_flush_area); + page_id_t high= low + buf_flush_area; + high.set_page_no(std::min(high.page_no(), space.last_page_number())); + + if (!contiguous) + { + high= std::max(id + 1, high); + id= low; + return high; + } + + /* Determine the contiguous dirty area around id. */ + const ulint id_fold= id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (id > low) + { + ulint fold= id_fold; + for (page_id_t i= id - 1;; --i) + { + fold--; + if (!buf_flush_check_neighbor(i, fold, lru)) + { + low= i + 1; + break; + } + if (i == low) + break; + } + } + + page_id_t i= id; + id= low; + ulint fold= id_fold; + while (++i < high) + { + ++fold; + if (!buf_flush_check_neighbor(i, fold, lru)) + break; + } + + mysql_mutex_unlock(&buf_pool.mutex); + return i; +} + +MY_ATTRIBUTE((nonnull)) +/** Write punch-hole or zeroes of the freed ranges when +innodb_immediate_scrub_data_uncompressed from the freed ranges. +@param space tablespace which may contain ranges of freed pages */ +static void buf_flush_freed_pages(fil_space_t *space) +{ + const bool punch_hole= space->punch_hole; + if (!srv_immediate_scrub_data_uncompressed && !punch_hole) + return; + lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn(); + + std::unique_lock<std::mutex> freed_lock(space->freed_range_mutex); + if (space->freed_ranges.empty() + || flush_to_disk_lsn < space->get_last_freed_lsn()) + { + freed_lock.unlock(); + return; + } + + range_set freed_ranges= std::move(space->freed_ranges); + freed_lock.unlock(); + + for (const auto &range : freed_ranges) + { + const ulint physical_size= space->physical_size(); + + if (punch_hole) + { + space->reacquire(); + space->io(IORequest(IORequest::PUNCH_RANGE), + os_offset_t{range.first} * physical_size, + (range.last - range.first + 1) * physical_size, + nullptr); + } + else if (srv_immediate_scrub_data_uncompressed) + { + for (os_offset_t i= range.first; i <= range.last; i++) + { + space->reacquire(); + space->io(IORequest(IORequest::WRITE_ASYNC), + i * physical_size, physical_size, + const_cast<byte*>(field_ref_zero)); + } + } + buf_pool.stat.n_pages_written+= (range.last - range.first + 1); + } +} + +/** Flushes to disk all flushable pages within the flush area +and also write zeroes or punch the hole for the freed ranges of pages. 
+@param space tablespace +@param page_id page identifier +@param contiguous whether to consider contiguous areas of pages +@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param n_flushed number of pages flushed so far in this batch +@param n_to_flush maximum number of pages we are allowed to flush +@return number of pages flushed */ +static ulint buf_flush_try_neighbors(fil_space_t *space, + const page_id_t page_id, + bool contiguous, bool lru, + ulint n_flushed, ulint n_to_flush) +{ + ut_ad(space->id == page_id.space()); + + ulint count= 0; + page_id_t id= page_id; + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru); + + ut_ad(page_id >= id); + ut_ad(page_id < high); + + for (ulint id_fold= id.fold(); id < high && !space->is_stopping(); + ++id, ++id_fold) + { + if (count + n_flushed >= n_to_flush) + { + if (id > page_id) + break; + /* If the page whose neighbors we are flushing has not been + flushed yet, we must flush the page that we selected originally. */ + id= page_id; + id_fold= id.fold(); + } + + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold)) + { + ut_ad(bpage->in_file()); + /* We avoid flushing 'non-old' blocks in an LRU flush, + because the flushed blocks are soon freed */ + if (!lru || id == page_id || bpage->is_old()) + { + if (!buf_pool.watch_is_sentinel(*bpage) && + bpage->oldest_modification() > 1 && + bpage->ready_for_flush() && buf_flush_page(bpage, lru, space)) + { + ++count; + continue; + } + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } + + if (auto n= count - 1) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, n); + } + + return count; +} + +/*******************************************************************//** +This utility moves the uncompressed frames of pages to the free list. +Note that this function does not actually flush any data to disk. It +just detaches the uncompressed frames from the compressed pages at the +tail of the unzip_LRU and puts those freed frames in the free list. +Note that it is a best effort attempt and it is not guaranteed that +after a call to this function there will be 'max' blocks in the free +list. +@param[in] max desired number of blocks in the free_list +@return number of blocks moved to the free list. */ +static ulint buf_free_from_unzip_LRU_list_batch(ulint max) +{ + ulint scanned = 0; + ulint count = 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + + while (block + && count < max + && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth + && UT_LIST_GET_LEN(buf_pool.unzip_LRU) + > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + + ++scanned; + if (buf_LRU_free_page(&block->page, false)) { + /* Block was freed. buf_pool.mutex potentially + released and reacquired */ + ++count; + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } else { + block = UT_LIST_GET_PREV(unzip_LRU, block); + } + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } + + return(count); +} + +/** Start writing out pages for a tablespace. 
+@param id tablespace identifier +@return tablespace +@retval nullptr if the pages for this tablespace should be discarded */ +static fil_space_t *buf_flush_space(const uint32_t id) +{ + fil_space_t *space= fil_space_t::get(id); + if (space) + buf_flush_freed_pages(space); + return space; +} + +struct flush_counters_t +{ + /** number of dirty pages flushed */ + ulint flushed; + /** number of clean pages evicted */ + ulint evicted; +}; + +/** Try to discard a dirty page. +@param bpage dirty page whose tablespace is not accessible */ +static void buf_flush_discard_page(buf_page_t *bpage) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(bpage->in_file()); + ut_ad(bpage->oldest_modification()); + + rw_lock_t *rw_lock; + + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + rw_lock= nullptr; + else + { + rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock; + if (!rw_lock_sx_lock_nowait(rw_lock, 0)) + return; + } + + bpage->status= buf_page_t::NORMAL; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.delete_from_flush_list(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (rw_lock) + rw_lock_sx_unlock(rw_lock); + + buf_LRU_free_page(bpage, true); +} + +/** Flush dirty blocks from the end of the LRU list. +@param max maximum number of blocks to make available in buf_pool.free +@param n counts of flushed and evicted pages */ +static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) +{ + ulint scanned= 0; + ulint free_limit= srv_LRU_scan_depth; + + mysql_mutex_assert_owner(&buf_pool.mutex); + if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size) + free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); + bpage && n->flushed + n->evicted < max && + UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && + UT_LIST_GET_LEN(buf_pool.free) < free_limit; ++scanned) + { + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + const lsn_t oldest_modification= bpage->oldest_modification(); + buf_pool.lru_hp.set(prev); + + if (oldest_modification <= 1 && bpage->can_relocate()) + { + /* block is ready for eviction i.e., it is clean and is not + IO-fixed or buffer fixed. */ + if (buf_LRU_free_page(bpage, true)) + ++n->evicted; + } + else if (oldest_modification > 1 && bpage->ready_for_flush()) + { + /* Block is ready for flush. Dispatch an IO request. The IO + helper thread will put it on free list in IO completion routine. 
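+ Flushing may release and reacquire buf_pool.mutex (see the
+ reacquire_mutex label below), which is why the hazard pointer
+ buf_pool.lru_hp was set above.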
*/ + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + if (space) + space->release(); + space= buf_flush_space(space_id); + last_space_id= space_id; + } + else + ut_ad(!space); + } + else if (space->is_stopping()) + { + space->release(); + space= nullptr; + } + + if (!space) + buf_flush_discard_page(bpage); + else if (neighbors && space->is_rotational()) + { + mysql_mutex_unlock(&buf_pool.mutex); + n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, + true, n->flushed, max); +reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + } + else if (buf_flush_page(bpage, true, space)) + { + ++n->flushed; + goto reacquire_mutex; + } + } + else + /* Can't evict or dispatch this block. Go to previous. */ + ut_ad(buf_pool.lru_hp.is_hp(prev)); + bpage= buf_pool.lru_hp.get(); + } + + buf_pool.lru_hp.set(nullptr); + + if (space) + space->release(); + + /* We keep track of all flushes happening as part of LRU flush. When + estimating the desired rate at which flush_list should be flushed, + we factor in this value. */ + buf_lru_flush_page_count+= n->flushed; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); +} + +/** Flush and move pages from LRU or unzip_LRU list to the free list. +Whether LRU or unzip_LRU is used depends on the state of the system. +@param max maximum number of blocks to make available in buf_pool.free +@return number of flushed pages */ +static ulint buf_do_LRU_batch(ulint max) +{ + const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU() + ? buf_free_from_unzip_LRU_list_batch(max) + : 0; + flush_counters_t n; + n.flushed= 0; + n.evicted= n_unzip_LRU_evicted; + buf_flush_LRU_list_batch(max, &n); + + if (const ulint evicted= n.evicted - n_unzip_LRU_evicted) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + evicted); + } + + return n.flushed; +} + +/** This utility flushes dirty blocks from the end of the flush_list. +The calling thread is not allowed to own any latches on pages! +@param max_n maximum mumber of blocks to flush +@param lsn once an oldest_modification>=lsn is found, terminate the batch +@return number of blocks for which the write request was queued */ +static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) +{ + ulint count= 0; + ulint scanned= 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + /* Start from the end of the list looking for a suitable block to be + flushed. 
*/ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); + bpage && len && count < max_n; ++scanned, len--) + { + const lsn_t oldest_modification= bpage->oldest_modification(); + if (oldest_modification >= lsn) + break; + ut_ad(bpage->in_file()); + + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + if (oldest_modification == 1) + { + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } + + ut_ad(oldest_modification > 2); + ut_ad(bpage->in_file()); + + if (!bpage->ready_for_flush()) + goto skip; + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. The buf_pool.n_flush_list() + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. */ + buf_pool.flush_hp.set(prev); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + if (space) + space->release(); + space= buf_flush_space(space_id); + last_space_id= space_id; + } + else + ut_ad(!space); + } + else if (space->is_stopping()) + { + space->release(); + space= nullptr; + } + + if (!space) + buf_flush_discard_page(bpage); + else if (neighbors && space->is_rotational()) + { + mysql_mutex_unlock(&buf_pool.mutex); + count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, + false, count, max_n); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + } + else if (buf_flush_page(bpage, false, space)) + { + ++count; + goto reacquire_mutex; + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + bpage= buf_pool.flush_hp.get(); + } + + buf_pool.flush_hp.set(nullptr); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (space) + space->release(); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + scanned); + if (count) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + count); + mysql_mutex_assert_owner(&buf_pool.mutex); + return count; +} + +/** Wait until a flush batch ends. +@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ +void buf_flush_wait_batch_end(bool lru) +{ + const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_; + + if (n_flush) + { + auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list; + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + do + my_cond_wait(cond, &buf_pool.mutex.m_mutex); + while (n_flush); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); + pthread_cond_broadcast(cond); + } +} + +/** Write out dirty blocks from buf_pool.flush_list. 
+@param max_n wished maximum number of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+ulint buf_flush_list(ulint max_n, lsn_t lsn)
+{
+ ut_ad(lsn);
+
+ if (buf_pool.n_flush_list())
+ return 0;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ const bool running= buf_pool.n_flush_list_ != 0;
+ /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
+ while not holding buf_pool.flush_list_mutex */
+ if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+ {
+ if (!running)
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return 0;
+ }
+
+ buf_pool.n_flush_list_++;
+ const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+ const ulint n_flushing= --buf_pool.n_flush_list_;
+
+ buf_pool.try_LRU_scan= true;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (!n_flushing)
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+ buf_dblwr.flush_buffered_writes();
+
+ DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
+ return n_flushed;
+}
+
+/** Try to flush all the dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
+{
+ const auto space_id= space->id;
+ ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+ bool may_have_skipped= false;
+ ulint max_n_flush= srv_io_capacity;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ bool acquired= space->acquire();
+ buf_flush_freed_pages(space);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+ s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(bpage->oldest_modification());
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+ if (bpage->id().space() != space_id);
+ else if (bpage->oldest_modification() == 1)
+ buf_pool.delete_from_flush_list(bpage);
+ else if (!bpage->ready_for_flush())
+ may_have_skipped= true;
+ else
+ {
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: Multiple executions of buf_flush_list_space() may be
+ interleaved, and also buf_do_flush_list_batch() may be running
+ concurrently. This may terminate our iteration prematurely,
+ leading us to return may_have_skipped=true. 
*/ + buf_pool.flush_hp.set(prev); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (!acquired) + { + was_freed: + buf_flush_discard_page(bpage); + } + else + { + if (space->is_stopping()) + { + space->release(); + acquired= false; + goto was_freed; + } + if (!buf_flush_page(bpage, false, space)) + { + may_have_skipped= true; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + goto next_after_skip; + } + if (n_flushed) + ++*n_flushed; + if (!--max_n_flush) + { + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + break; + } + mysql_mutex_lock(&buf_pool.mutex); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.flush_hp.is_hp(prev)) + may_have_skipped= true; + next_after_skip: + bpage= buf_pool.flush_hp.get(); + continue; + } + + bpage= prev; + } + + /* Note: this loop may have been executed concurrently with + buf_do_flush_list_batch() as well as other threads executing + buf_flush_list_space(). We should always return true from + buf_flush_list_space() if that should be the case; in + buf_do_flush_list_batch() we will simply perform less work. */ + + buf_pool.flush_hp.set(nullptr); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_pool.try_LRU_scan= true; + + mysql_mutex_unlock(&buf_pool.mutex); + + if (acquired) + space->release(); + + if (space->purpose == FIL_TYPE_IMPORT) + os_aio_wait_until_no_pending_writes(); + else + buf_dblwr.flush_buffered_writes(); + + return may_have_skipped; +} + +/** Write out dirty blocks from buf_pool.LRU. +@param max_n wished maximum mumber of blocks flushed +@return the number of processed pages +@retval 0 if a buf_pool.LRU batch is already running */ +ulint buf_flush_LRU(ulint max_n) +{ + if (buf_pool.n_flush_LRU()) + return 0; + + log_buffer_flush_to_disk(true); + + mysql_mutex_lock(&buf_pool.mutex); + if (buf_pool.n_flush_LRU_) + { + mysql_mutex_unlock(&buf_pool.mutex); + return 0; + } + buf_pool.n_flush_LRU_++; + + ulint n_flushed= buf_do_LRU_batch(max_n); + + const ulint n_flushing= --buf_pool.n_flush_LRU_; + + buf_pool.try_LRU_scan= true; + + mysql_mutex_unlock(&buf_pool.mutex); + + if (!n_flushing) + { + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + pthread_cond_signal(&buf_pool.done_free); + } + + buf_dblwr.flush_buffered_writes(); + + DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed)); + return n_flushed; +} + +/** Initiate a log checkpoint, discarding the start of the log. +@param oldest_lsn the checkpoint LSN +@param end_lsn log_sys.get_lsn() +@return true if success, false if a checkpoint write was already running */ +static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) +{ + ut_ad(!srv_read_only_mode); + mysql_mutex_assert_owner(&log_sys.mutex); + ut_ad(oldest_lsn <= end_lsn); + ut_ad(end_lsn == log_sys.get_lsn()); + ut_ad(!recv_no_log_write); + + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT) + /* Some log has been written since the previous checkpoint. */; + else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + /* MariaDB startup expects the redo log file to be logically empty + (not even containing a FILE_CHECKPOINT record) after a clean shutdown. + Perform an extra checkpoint at shutdown. */; + else + { + /* Do nothing, because nothing was logged (other than a + FILE_CHECKPOINT record) since the previous checkpoint. 
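+ Advancing the checkpoint would not make any more of the log
+ reclaimable, so simply report success.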
*/ + mysql_mutex_unlock(&log_sys.mutex); + return true; + } + + /* Repeat the FILE_MODIFY records after the checkpoint, in case some + log records between the checkpoint and log_sys.lsn need them. + Finally, write a FILE_CHECKPOINT record. Redo log apply expects to + see a FILE_CHECKPOINT after the checkpoint, except on clean + shutdown, where the log will be empty after the checkpoint. + + It is important that we write out the redo log before any further + dirty pages are flushed to the tablespace files. At this point, + because we hold log_sys.mutex, mtr_t::commit() in other threads will + be blocked, and no pages can be added to the flush lists. */ + lsn_t flush_lsn= oldest_lsn; + + if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn || + srv_shutdown_state <= SRV_SHUTDOWN_INITIATED)) + { + flush_lsn= log_sys.get_lsn(); + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + mysql_mutex_unlock(&log_sys.mutex); + log_write_up_to(flush_lsn, true, true); + mysql_mutex_lock(&log_sys.mutex); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + { + mysql_mutex_unlock(&log_sys.mutex); + return true; + } + } + else + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); + + ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); + + if (log_sys.n_pending_checkpoint_writes) + { + /* A checkpoint write is running */ + mysql_mutex_unlock(&log_sys.mutex); + return false; + } + + log_sys.next_checkpoint_lsn= oldest_lsn; + log_write_checkpoint_info(end_lsn); + mysql_mutex_assert_not_owner(&log_sys.mutex); + + return true; +} + +/** Make a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log file. Use log_make_checkpoint() to flush also the pool. +@retval true if the checkpoint was or had been made +@retval false if a checkpoint write was already running */ +static bool log_checkpoint() +{ + if (recv_recovery_is_on()) + recv_sys.apply(true); + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + mysql_mutex_lock(&log_sys.mutex); + const lsn_t end_lsn= log_sys.get_lsn(); + mysql_mutex_lock(&log_sys.flush_order_mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_unlock(&log_sys.flush_order_mutex); + return log_checkpoint_low(oldest_lsn, end_lsn); +} + +/** Make a checkpoint. */ +ATTRIBUTE_COLD void log_make_checkpoint() +{ + buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); + while (!log_checkpoint()); +} + +/** Wait until all persistent pages are flushed up to a limit. 
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) +{ + ut_ad(sync_lsn); + ut_ad(sync_lsn < LSN_MAX); + mysql_mutex_assert_not_owner(&log_sys.mutex); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) + { +#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ + if (UNIV_UNLIKELY(!buf_page_cleaner_is_active) + ut_d(|| innodb_page_cleaner_disabled_debug)) + { + do + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); + buf_flush_wait_batch_end_acquiring_mutex(false); + if (n_pages) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_pages); + } + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); + + goto try_checkpoint; + } +#endif + if (buf_flush_sync_lsn < sync_lsn) + { + buf_flush_sync_lsn= sync_lsn; + pthread_cond_signal(&buf_pool.do_flush_list); + } + + do + { + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + thd_wait_end(nullptr); + tpool::tpool_wait_end(); + + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + } + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); + } + +try_checkpoint: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn)) + { + /* If the buffer pool was clean, no log write was guaranteed + to happen until now. There could be an outstanding FILE_CHECKPOINT + record from a previous fil_names_clear() call, which we must + write out before we can advance the checkpoint. */ + if (sync_lsn > log_sys.get_flushed_lsn()) + log_write_up_to(sync_lsn, true); + log_checkpoint(); + } +} + +/** Initiate more eager page flushing if the log checkpoint age is too old. +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@param furious true=furious flushing, false=limit to innodb_io_capacity */ +ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) +{ + mysql_mutex_assert_not_owner(&log_sys.mutex); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + Atomic_relaxed<lsn_t> &limit= furious + ? buf_flush_sync_lsn : buf_flush_async_lsn; + + if (limit < lsn) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (limit < lsn) + limit= lsn; + pthread_cond_signal(&buf_pool.do_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } +} + +/** Wait for pending flushes to complete. */ +void buf_flush_wait_batch_end_acquiring_mutex(bool lru) +{ + if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list()) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_wait_batch_end(lru); + mysql_mutex_unlock(&buf_pool.mutex); + } +} + +/** Conduct checkpoint-related flushing for innodb_flush_sync=ON, +and try to initiate checkpoints until the target is met. 
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+ ut_ad(!srv_read_only_mode);
+
+ for (;;)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+ }
+
+ /* Attempt to perform a log checkpoint upon completing each batch. */
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
+
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t newest_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_t measure= buf_pool.get_oldest_modification(0);
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+ if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ log_checkpoint_low(checkpoint_lsn, newest_lsn);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ measure= buf_pool.get_oldest_modification(LSN_MAX);
+ }
+ else
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ if (!measure)
+ measure= LSN_MAX;
+ }
+
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+ /* After attempting log checkpoint, check if we have reached our target. */
+ const lsn_t target= buf_flush_sync_lsn;
+
+ if (measure >= target)
+ buf_flush_sync_lsn= 0;
+ else if (measure >= buf_flush_async_lsn)
+ buf_flush_async_lsn= 0;
+
+ /* wake up buf_flush_wait_flushed() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+ lsn= std::max(lsn, target);
+
+ if (measure >= lsn)
+ return;
+ }
+}
+
+/** Check if the adaptive flushing threshold is recommended based on
+redo log capacity filled threshold.
+@param oldest_lsn buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+ lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+ lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+ static_cast<double>(log_sys.log_capacity) / 100);
+
+ /* if age > af_lwm adaptive flushing is recommended */
+ return (age > af_lwm);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+ lsn_t age) /*!< in: current age of LSN. */
+{
+ lsn_t af_lwm = static_cast<lsn_t>(
+ srv_adaptive_flushing_lwm
+ * static_cast<double>(log_sys.log_capacity) / 100);
+
+ if (age < af_lwm) {
+ /* No adaptive flushing. */
+ return(0);
+ }
+
+ lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
+
+ ut_ad(srv_max_io_capacity >= srv_io_capacity);
+ return static_cast<ulint>(
+ (static_cast<double>(srv_max_io_capacity / srv_io_capacity
+ * lsn_age_factor)
+ * sqrt(static_cast<double>(lsn_age_factor))
+ / 7.5));
+}
+
+/** This function is called approximately once every second by the
+page_cleaner thread if innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing. 
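+The returned value averages three estimates: the recent flush rate
+(avg_page_rate), the number of pages whose oldest_modification lies
+within the next expected LSN range (pages_for_lsn), and a share of
+innodb_io_capacity derived from the dirty page percentage and the
+redo log fill factor.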
+@return number of pages recommended to be flushed +@param last_pages_in number of pages flushed in previous batch +@param oldest_lsn buf_pool.get_oldest_modification(0) +@param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list) +@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */ +static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, + lsn_t oldest_lsn, + ulint dirty_blocks, + double dirty_pct) +{ + static lsn_t prev_lsn = 0; + static ulint sum_pages = 0; + static ulint avg_page_rate = 0; + static ulint n_iterations = 0; + static time_t prev_time; + lsn_t lsn_rate; + ulint n_pages = 0; + + const lsn_t cur_lsn = log_sys.get_lsn(); + ut_ad(oldest_lsn <= cur_lsn); + ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn); + time_t curr_time = time(nullptr); + const double max_pct = srv_max_buf_pool_modified_pct; + + if (!prev_lsn || !pct_for_lsn) { + prev_time = curr_time; + prev_lsn = cur_lsn; + if (max_pct > 0.0) { + dirty_pct /= max_pct; + } + + n_pages = ulint(dirty_pct * double(srv_io_capacity)); + if (n_pages < dirty_blocks) { + n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks); + } + + return n_pages; + } + + sum_pages += last_pages_in; + + double time_elapsed = difftime(curr_time, prev_time); + + /* We update our variables every srv_flushing_avg_loops + iterations to smooth out transition in workload. */ + if (++n_iterations >= srv_flushing_avg_loops + || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) { + + if (time_elapsed < 1) { + time_elapsed = 1; + } + + avg_page_rate = static_cast<ulint>( + ((static_cast<double>(sum_pages) + / time_elapsed) + + static_cast<double>(avg_page_rate)) / 2); + + /* How much LSN we have generated since last call. */ + lsn_rate = static_cast<lsn_t>( + static_cast<double>(cur_lsn - prev_lsn) + / time_elapsed); + + lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; + + ulint flush_tm = page_cleaner.flush_time; + ulint flush_pass = page_cleaner.flush_pass; + + page_cleaner.flush_time = 0; + page_cleaner.flush_pass = 0; + + if (flush_pass) { + flush_tm /= flush_pass; + } + + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass); + + prev_lsn = cur_lsn; + prev_time = curr_time; + + n_iterations = 0; + + sum_pages = 0; + } + + const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0 + ? (dirty_pct >= max_pct ? 100 : 0) + : static_cast<ulint> + (max_pct > 0.0 ? 
dirty_pct / max_pct : dirty_pct);
+ ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
+
+ /* Estimate pages to be flushed for the lsn progress */
+ lsn_t target_lsn = oldest_lsn
+ + lsn_avg_rate * buf_flush_lsn_scan_factor;
+ ulint pages_for_lsn = 0;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
+ b != NULL;
+ b = UT_LIST_GET_PREV(list, b)) {
+ if (b->oldest_modification() > target_lsn) {
+ break;
+ }
+ if (++pages_for_lsn >= srv_max_io_capacity) {
+ break;
+ }
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ pages_for_lsn /= buf_flush_lsn_scan_factor;
+ if (pages_for_lsn < 1) {
+ pages_for_lsn = 1;
+ }
+
+ n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
+ + avg_page_rate + pages_for_lsn) / 3;
+
+ if (n_pages > srv_max_io_capacity) {
+ n_pages = srv_max_io_capacity;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
+
+ MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+ MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+ return(n_pages);
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one coordinator.
+@return a dummy parameter */
+static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
+{
+ my_thread_init();
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_page_cleaner_is_active);
+
+ ulint last_pages= 0;
+ timespec abstime;
+ set_timespec(abstime, 1);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ lsn_t lsn_limit;
+ ulint last_activity_count= srv_get_activity_count();
+
+ for (;;)
+ {
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+furious_flush:
+ if (UNIV_LIKELY(srv_flush_sync))
+ {
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
+ }
+ }
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ /* If buf page cleaner is idle and there is no work
+ (either dirty pages are all flushed or adaptive flushing
+ is not enabled) then opt for non-timed wait */
+ if (buf_pool.page_cleaner_idle() &&
+ (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+ srv_max_dirty_pages_pct_lwm == 0.0))
+ my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
+ else
+ my_cond_timedwait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex, &abstime);
+
+ set_timespec(abstime, 1);
+
+ lsn_t soft_lsn_limit= buf_flush_async_lsn;
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ if (UNIV_LIKELY(srv_flush_sync))
+ goto furious_flush;
+ }
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+ if (!oldest_lsn)
+ {
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ buf_flush_sync_lsn= 0;
+ /* wake up buf_flush_wait_flushed() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+unemployed:
+ buf_flush_async_lsn= 0;
+ buf_pool.page_cleaner_set_idle(true);
+ continue;
+ }
+
+ const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+ ut_ad(dirty_blocks);
+ /* We perform dirty reads of the LRU+free list lengths here. 
+ Division by zero is not possible, because buf_pool.flush_list is + guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ + const double dirty_pct= double(dirty_blocks) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + + bool idle_flush= false; + + if (lsn_limit || soft_lsn_limit); + else if (af_needed_for_redo(oldest_lsn)); + else if (srv_max_dirty_pages_pct_lwm != 0.0) + { + const ulint activity_count= srv_get_activity_count(); + if (activity_count != last_activity_count) + last_activity_count= activity_count; + else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0) + { + /* reaching here means 3 things: + - last_activity_count == activity_count: suggesting server is idle + (no trx_t::commit activity) + - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) + - there are no pending reads but there are dirty pages to flush */ + idle_flush= true; + buf_pool.update_last_activity_count(activity_count); + } + + if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm) + goto unemployed; + } + else if (dirty_pct < srv_max_buf_pool_modified_pct) + goto unemployed; + + if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit) + lsn_limit= buf_flush_sync_lsn= 0; + if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit) + soft_lsn_limit= buf_flush_async_lsn= 0; + + buf_pool.page_cleaner_set_idle(false); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (!lsn_limit) + lsn_limit= soft_lsn_limit; + + ulint n_flushed; + + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit); + /* wake up buf_flush_wait_flushed() */ + pthread_cond_broadcast(&buf_pool.done_flush_list); + goto try_checkpoint; + } + else if (idle_flush || !srv_adaptive_flushing) + { + n_flushed= buf_flush_list(srv_io_capacity); +try_checkpoint: + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); +do_checkpoint: + /* The periodic log_checkpoint() call here makes it harder to + reproduce bugs in crash recovery or mariabackup --prepare, or + in code that writes the redo log records. Omitting the call + here should not affect correctness, because log_free_check() + should still be invoking checkpoints when needed. */ + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;); + + if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL) + log_checkpoint(); + } + } + else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + dirty_blocks, + dirty_pct)) + { + page_cleaner.flush_pass++; + const ulint tm= ut_time_ms(); + last_pages= n_flushed= buf_flush_list(n); + page_cleaner.flush_time+= ut_time_ms() - tm; + + if (n_flushed) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); + goto do_checkpoint; + } + } + else if (buf_flush_async_lsn <= oldest_lsn) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + goto unemployed; + } + +#ifdef UNIV_DEBUG + while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn && + srv_shutdown_state == SRV_SHUTDOWN_NONE) + os_thread_sleep(100000); +#endif /* UNIV_DEBUG */ + +#ifndef DBUG_OFF +next: +#endif /* !DBUG_OFF */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + /* when idle flushing kicks in page_cleaner is marked active. + reset it back to idle since the it was made active as part of + idle flushing stage. 
*/ + if (idle_flush) + buf_pool.page_cleaner_set_idle(true); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (srv_fast_shutdown != 2) + { + buf_flush_wait_batch_end_acquiring_mutex(true); + buf_flush_wait_batch_end_acquiring_mutex(false); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; + if (UNIV_UNLIKELY(lsn_limit != 0)) + goto furious_flush; + buf_page_cleaner_is_active= false; + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + my_thread_end(); + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + os_thread_exit(); + + OS_THREAD_DUMMY_RETURN; +} + +/** Initialize page_cleaner. */ +ATTRIBUTE_COLD void buf_flush_page_cleaner_init() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(srv_operation == SRV_OPERATION_NORMAL || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_async_lsn= 0; + buf_flush_sync_lsn= 0; + buf_page_cleaner_is_active= true; + os_thread_create(buf_flush_page_cleaner); +} + +/** @return the number of dirty pages in the buffer pool */ +static ulint buf_flush_list_length() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return len; +} + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + while (buf_pool.n_flush_list() || buf_flush_list_length()) + { + buf_flush_list(srv_max_io_capacity); + timespec abstime; + + if (buf_pool.n_flush_list()) + { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + buf_flush_list_length()); + set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); + mysql_mutex_lock(&buf_pool.mutex); + while (buf_pool.n_flush_list_) + my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, + &abstime); + mysql_mutex_unlock(&buf_pool.mutex); + } + } + + ut_ad(!buf_pool.any_io_pending()); +} + +/** Synchronously flush dirty blocks. +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync() +{ + ut_ad(!sync_check_iterate(dict_sync_check())); + + for (;;) + { + const ulint n_flushed= buf_flush_list(srv_max_io_capacity); + buf_flush_wait_batch_end_acquiring_mutex(false); + if (!n_flushed && !buf_flush_list_length()) + return; + } +} + +#ifdef UNIV_DEBUG +/** Functor to validate the flush list. */ +struct Check { + void operator()(const buf_page_t* elem) const + { + ut_ad(elem->oldest_modification()); + ut_ad(!fsp_is_system_temporary(elem->id().space())); + } +}; + +/** Validate the flush list. */ +static void buf_flush_validate_low() +{ + buf_page_t* bpage; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + ut_list_validate(buf_pool.flush_list, Check()); + + bpage = UT_LIST_GET_FIRST(buf_pool.flush_list); + + while (bpage != NULL) { + const lsn_t om = bpage->oldest_modification(); + /* A page in buf_pool.flush_list can be in + BUF_BLOCK_REMOVE_HASH state. This happens when a page + is in the middle of being relocated. 
In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool.flush_list_mutex to complete the relocation. */
+ ut_d(const auto s= bpage->state());
+ ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
+ || s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(om == 1 || om > 2);
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+ ut_ad(om == 1 || !bpage || recv_recovery_is_on()
+ || om >= bpage->oldest_modification());
+ }
+}
+
+/** Validate the flush list. */
+void buf_flush_validate()
+{
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_validate_low();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
new file mode 100644
index 00000000..b282eb17
--- /dev/null
+++ b/storage/innobase/buf/buf0lru.cc
@@ -0,0 +1,1477 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.cc
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+#include "sync0rw.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if a diagnostic message about difficulty in finding free blocks
+in the buffer pool has already been printed. 
*/ +static bool buf_lru_free_blocks_error_printed; + +/******************************************************************//** +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics, +buf_LRU_evict_from_unzip_LRU() decides if we want to evict from +unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the +uncompressed frame (meaning we can evict dirty blocks as well). From +the regular LRU, we will evict the entire block (i.e.: both the +uncompressed and compressed data), which must be clean. */ + +/* @{ */ + +/** Number of intervals for which we keep the history of these stats. +Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */ +static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4; + +/** Co-efficient with which we multiply I/O operations to equate them +with page_zip_decompress() operations. */ +static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50; + +/** Sampled values buf_LRU_stat_cur. +Not protected by any mutex. Updated by buf_LRU_stat_update(). */ +static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; + +/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ +static ulint buf_LRU_stat_arr_ind; + +/** Current operation counters. Not protected by any mutex. Cleared +by buf_LRU_stat_update(). */ +buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Not Protected by any mutex. */ +buf_LRU_stat_t buf_LRU_stat_sum; + +/* @} */ + +/** @name Heuristics for detecting index scan @{ */ +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +uint buf_LRU_old_threshold_ms; +/* @} */ + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1, +the object will be freed. + +@param bpage buffer block +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + page_hash_latch *hash_lock, bool zip); + +/** Free a block to buf_pool */ +static void buf_LRU_block_free_hashed_page(buf_block_t *block) +{ + block->page.free_file_page(); + buf_LRU_block_free_non_file_page(block); +} + +/** Increase LRU size in bytes by the page size. +@param[in] bpage control block */ +static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage) +{ + /* FIXME: use atomics, not mutex */ + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_pool.stat.LRU_bytes += bpage->physical_size(); + + ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size); +} + +/** @return whether the unzip_LRU list should be used for evicting a victim +instead of the general LRU list */ +bool buf_LRU_evict_from_unzip_LRU() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* If the unzip_LRU list is empty, we can only use the LRU. 
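The statistics declared above (buf_LRU_stat_arr, buf_LRU_stat_arr_ind, buf_LRU_stat_cur, buf_LRU_stat_sum) form a small rolling window: a fixed number of per-interval samples kept in a ring, with a running sum that is patched incrementally rather than recomputed. The sketch below, which mirrors the update step performed later in buf_LRU_stat_update(), uses simplified types and names and is not part of the patch.

// Sketch of the rolling-window bookkeeping, under the assumptions above.
#include <array>
#include <cstddef>
#include <iostream>

struct lru_stat { unsigned long io = 0, unzip = 0; };

constexpr std::size_t N_INTERVAL = 4;          // matches BUF_LRU_STAT_N_INTERVAL

static std::array<lru_stat, N_INTERVAL> arr{}; // samples from past intervals
static std::size_t arr_ind = 0;                // round-robin cursor
static lru_stat cur;                           // counters for the running interval
static lru_stat sum;                           // running sum over arr[]

static void stat_update()                      // called once per monitoring interval
{
  lru_stat snapshot = cur;                     // cur may keep changing concurrently
  lru_stat& slot = arr[arr_ind];
  arr_ind = (arr_ind + 1) % N_INTERVAL;

  sum.io    += snapshot.io    - slot.io;       // add the new sample, drop the oldest
  sum.unzip += snapshot.unzip - slot.unzip;
  slot = snapshot;
  cur = lru_stat{};                            // start the next interval from zero
}

int main()
{
  for (int i = 1; i <= 6; i++)
  {
    cur.io = 10 * i;                           // pretend this interval saw 10*i I/Os
    stat_update();
    std::cout << "window sum of io: " << sum.io << '\n';
  }
}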
*/ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) { + return false; + } + + /* If unzip_LRU is at most 10% of the size of the LRU list, + then use the LRU. This slack allows us to keep hot + decompressed pages in the buffer pool. */ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) + <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + return false; + } + + /* If eviction hasn't started yet, we assume by default + that a workload is disk bound. */ + if (buf_pool.freed_page_clock == 0) { + return true; + } + + /* Calculate the average over past intervals, and add the values + of the current interval. */ + ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.io; + + ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.unzip; + + /* Decide based on our formula. If the load is I/O bound + (unzip_avg is smaller than the weighted io_avg), evict an + uncompressed frame from unzip_LRU. Otherwise we assume that + the load is CPU bound and evict from the regular LRU. */ + return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR); +} + +/** Try to free an uncompressed page of a compressed block from the unzip +LRU list. The compressed page is preserved, and it need not be clean. +@param limit maximum number of blocks to scan +@return true if freed */ +static bool buf_LRU_free_from_unzip_LRU_list(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (!buf_LRU_evict_from_unzip_LRU()) { + return(false); + } + + ulint scanned = 0; + bool freed = false; + + for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + block && scanned < limit; ++scanned) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + freed = buf_LRU_free_page(&block->page, false); + if (freed) { + break; + } + + block = prev_block; + } + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + } + + return(freed); +} + +/** Try to free a clean page from the common LRU list. +@param limit maximum number of blocks to scan +@return whether a page was freed */ +static bool buf_LRU_free_from_common_LRU_list(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + ulint scanned = 0; + bool freed = false; + + for (buf_page_t* bpage = buf_pool.lru_scan_itr.start(); + bpage && scanned < limit; + ++scanned, bpage = buf_pool.lru_scan_itr.get()) { + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_scan_itr.set(prev); + + const auto accessed = bpage->is_accessed(); + + if (buf_LRU_free_page(bpage, true)) { + if (!accessed) { + /* Keep track of pages that are evicted without + ever being accessed. This gives us a measure of + the effectiveness of readahead */ + ++buf_pool.stat.n_ra_pages_evicted; + } + + freed = true; + break; + } + } + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + scanned); + } + + return(freed); +} + +/** Try to free a replaceable block. 
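The decision formula at the end of buf_LRU_evict_from_unzip_LRU() above is compact enough to show in isolation: I/O operations are weighted by a factor and compared with page_zip_decompress() activity to guess whether the workload is disk bound (free only uncompressed frames) or CPU bound (evict whole blocks). The sketch below is illustrative; the function name and the sample numbers are invented.

// Sketch of the disk-bound vs. CPU-bound heuristic, under the assumptions above.
#include <iostream>

constexpr unsigned long N_INTERVAL = 4;
constexpr unsigned long IO_TO_UNZIP_FACTOR = 50;

static bool evict_from_unzip_lru(unsigned long io_sum, unsigned long io_cur,
                                 unsigned long unzip_sum, unsigned long unzip_cur)
{
  const unsigned long io_avg = io_sum / N_INTERVAL + io_cur;
  const unsigned long unzip_avg = unzip_sum / N_INTERVAL + unzip_cur;
  // Disk bound: decompression is cheap relative to weighted I/O, so keep the
  // compressed copy in memory and free only the uncompressed frame.
  return unzip_avg <= io_avg * IO_TO_UNZIP_FACTOR;
}

int main()
{
  // Plenty of I/O, little decompression over the window: clearly disk bound.
  std::cout << evict_from_unzip_lru(400, 100, 30, 5) << '\n';   // prints 1
  // Heavy decompression with almost no I/O: treat as CPU bound.
  std::cout << evict_from_unzip_lru(4, 1, 20000, 5000) << '\n'; // prints 0
}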
+@param limit maximum number of blocks to scan +@return true if found and freed */ +bool buf_LRU_scan_and_free_block(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + return buf_LRU_free_from_unzip_LRU_list(limit) || + buf_LRU_free_from_common_LRU_list(limit); +} + +/** @return a buffer block from the buf_pool.free list +@retval NULL if the free list is empty */ +buf_block_t* buf_LRU_get_free_only() +{ + buf_block_t* block; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_FIRST(buf_pool.free)); + + while (block != NULL) { + ut_ad(block->page.in_free_list); + ut_d(block->page.in_free_list = FALSE); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_a(!block->page.in_file()); + UT_LIST_REMOVE(buf_pool.free, &block->page); + + if (buf_pool.curr_size >= buf_pool.old_size + || UT_LIST_GET_LEN(buf_pool.withdraw) + >= buf_pool.withdraw_target + || !buf_pool.will_be_withdrawn(block->page)) { + /* No adaptive hash index entries may point to + a free block. */ + assert_block_ahi_empty(block); + + block->page.set_state(BUF_BLOCK_MEMORY); + MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size); + break; + } + + /* This should be withdrawn */ + UT_LIST_ADD_LAST( + buf_pool.withdraw, + &block->page); + ut_d(block->in_withdraw_list = true); + + block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_FIRST(buf_pool.free)); + } + + return(block); +} + +/******************************************************************//** +Checks how much of buf_pool is occupied by non-data objects like +AHI, lock heaps etc. Depending on the size of non-data objects this +function will either assert or issue a warning and switch on the +status monitor. */ +static void buf_LRU_check_size_of_non_data_objects() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (recv_recovery_is_on() || buf_pool.curr_size != buf_pool.old_size) + return; + + const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + + if (s < buf_pool.curr_size / 20) + ib::fatal() << "Over 95 percent of the buffer pool is" + " occupied by lock heaps" +#ifdef BTR_CUR_HASH_ADAPT + " or the adaptive hash index" +#endif /* BTR_CUR_HASH_ADAPT */ + "! Check that your transactions do not set too many" + " row locks, or review if innodb_buffer_pool_size=" + << (buf_pool.curr_size >> (20U - srv_page_size_shift)) + << "M could be bigger."; + + if (s < buf_pool.curr_size / 3) + { + if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer) + { + /* Over 67 % of the buffer pool is occupied by lock heaps or + the adaptive hash index. This may be a memory leak! */ + ib::warn() << "Over 67 percent of the buffer pool is" + " occupied by lock heaps" +#ifdef BTR_CUR_HASH_ADAPT + " or the adaptive hash index" +#endif /* BTR_CUR_HASH_ADAPT */ + "! Check that your transactions do not set too many row locks." + " innodb_buffer_pool_size=" + << (buf_pool.curr_size >> (20U - srv_page_size_shift)) + << "M. Starting the InnoDB Monitor to print diagnostics."; + buf_lru_switched_on_innodb_mon= true; + srv_print_innodb_monitor= TRUE; + srv_monitor_timer_schedule_now(); + } + } + else if (buf_lru_switched_on_innodb_mon) + { + /* Switch off the InnoDB Monitor; this is a simple way to stop the + monitor if the situation becomes less urgent, but may also + surprise users who did SET GLOBAL innodb_status_output=ON earlier! */ + buf_lru_switched_on_innodb_mon= false; + srv_print_innodb_monitor= FALSE; + } +} + +/** Get a block from the buf_pool.free list. 
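The thresholds in buf_LRU_check_size_of_non_data_objects() above reduce to simple fractions: when the pages reachable through the free list and the LRU list cover less than 1/20 (respectively 1/3) of the pool, more than 95 % (respectively 67 %) of it is held by non-data objects such as lock heaps or the adaptive hash index. A small sketch of that arithmetic follows; the enum and function name are invented, and the real code aborts or warns instead of returning a value.

// Sketch of the occupancy thresholds, under the assumptions above.
#include <cstddef>
#include <iostream>

enum class pool_health { fatal, warn, ok };

static pool_health check_non_data_objects(std::size_t n_free, std::size_t n_lru,
                                          std::size_t curr_size /* in pages */)
{
  const std::size_t s = n_free + n_lru;   // pages still usable for file data
  if (s < curr_size / 20)
    return pool_health::fatal;            // over 95 % of the pool is non-data
  if (s < curr_size / 3)
    return pool_health::warn;             // over 67 % of the pool is non-data
  return pool_health::ok;
}

int main()
{
  std::cout << int(check_non_data_objects(100, 300, 16384)) << '\n';   // 0: fatal
  std::cout << int(check_non_data_objects(1000, 3000, 16384)) << '\n'; // 1: warn
  std::cout << int(check_non_data_objects(4000, 8000, 16384)) << '\n'; // 2: ok
}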
+If the list is empty, blocks will be moved from the end of buf_pool.LRU +to buf_pool.free. + +This function is called from a user thread when it needs a clean +block to read in a page. Note that we only ever get a block from +the free list. Even when we flush a page or find a page in LRU scan +we put it to free list to be used. +* iteration 0: + * get a block from the buf_pool.free list, success:done + * if buf_pool.try_LRU_scan is set + * scan LRU up to 100 pages to free a clean block + * success:retry the free list + * flush up to innodb_lru_flush_size LRU blocks to data files + (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth) + * on buf_page_write_complete() the blocks will put on buf_pool.free list + * success: retry the free list +* subsequent iterations: same as iteration 0 except: + * scan whole LRU list + * scan LRU list even if buf_pool.try_LRU_scan is not set + +@param have_mutex whether buf_pool.mutex is already being held +@return the free control block, in state BUF_BLOCK_MEMORY */ +buf_block_t *buf_LRU_get_free_block(bool have_mutex) +{ + ulint n_iterations = 0; + ulint flush_failures = 0; + MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); + if (have_mutex) { + mysql_mutex_assert_owner(&buf_pool.mutex); + goto got_mutex; + } + mysql_mutex_lock(&buf_pool.mutex); +got_mutex: + buf_LRU_check_size_of_non_data_objects(); + buf_block_t* block; + + DBUG_EXECUTE_IF("ib_lru_force_no_free_page", + if (!buf_lru_free_blocks_error_printed) { + n_iterations = 21; + goto not_found;}); + +retry: + /* If there is a block in the free list, take it */ + if ((block = buf_LRU_get_free_only()) != nullptr) { +got_block: + if (!have_mutex) { + mysql_mutex_unlock(&buf_pool.mutex); + } + memset(&block->page.zip, 0, sizeof block->page.zip); + return block; + } + + MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS ); + if (n_iterations || buf_pool.try_LRU_scan) { + /* If no block was in the free list, search from the + end of the LRU list and try to free a block there. + If we are doing for the first time we'll scan only + tail of the LRU list otherwise we scan the whole LRU + list. */ + if (buf_LRU_scan_and_free_block(n_iterations + ? ULINT_UNDEFINED : 100)) { + goto retry; + } + + /* Tell other threads that there is no point + in scanning the LRU list. */ + buf_pool.try_LRU_scan = false; + } + + for (;;) { + if ((block = buf_LRU_get_free_only()) != nullptr) { + goto got_block; + } + if (!buf_pool.n_flush_LRU_) { + break; + } + my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex); + } + +#ifndef DBUG_OFF +not_found: +#endif + mysql_mutex_unlock(&buf_pool.mutex); + + if (n_iterations > 20 && !buf_lru_free_blocks_error_printed + && srv_buf_pool_old_size == srv_buf_pool_size) { + + ib::warn() << "Difficult to find free blocks in the buffer pool" + " (" << n_iterations << " search iterations)! " + << flush_failures << " failed attempts to" + " flush a page!" + " Consider increasing innodb_buffer_pool_size." + " Pending flushes (fsync) log: " + << log_sys.get_pending_flushes() + << "; buffer pool: " + << fil_n_pending_tablespace_flushes + << ". " << os_n_file_reads << " OS file reads, " + << os_n_file_writes << " OS file writes, " + << os_n_fsyncs + << " OS fsyncs."; + + buf_lru_free_blocks_error_printed = true; + } + + if (n_iterations > 1) { + MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); + } + + /* No free block was found: try to flush the LRU list. + The freed blocks will be up for grabs for all threads. 
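The iteration scheme documented above for buf_LRU_get_free_block() can be compressed into a toy model: keep retrying the free list, evict a clean block from the LRU tail when the free list is empty (a limited scan on the first pass, the whole list afterwards), and fall back to flushing LRU pages when nothing clean can be found. Everything in the sketch below, including the Pool struct and its counters, is an invented stand-in rather than InnoDB code.

// Sketch of the get-free-block retry loop, under the assumptions above.
#include <cstddef>
#include <iostream>
#include <optional>

struct Pool
{
  std::size_t free_blocks = 0;    // blocks on the free list
  std::size_t clean_lru = 0;      // clean blocks an LRU scan could evict
  std::size_t dirty_lru = 200;    // dirty blocks that need flushing first
};

static std::optional<int> get_free_block(Pool& pool)
{
  for (;;)
  {
    if (pool.free_blocks)
    {
      --pool.free_blocks;             // the common case: free list hit
      return 0;
    }

    // Scan the LRU tail for a clean, evictable block. The real code limits
    // the first scan to 100 pages and scans the whole list on later passes.
    if (pool.clean_lru)
    {
      --pool.clean_lru;
      ++pool.free_blocks;             // the evicted block lands on the free list
      continue;                       // retry the free list
    }

    if (!pool.dirty_lru)
      return std::nullopt;            // nothing to evict and nothing to flush

    // Fall back to flushing a batch of LRU pages (innodb_lru_flush_size);
    // once written out they become evictable on the next iteration.
    const std::size_t batch = pool.dirty_lru < 32 ? pool.dirty_lru : 32;
    pool.dirty_lru -= batch;
    pool.clean_lru += batch;
  }
}

int main()
{
  Pool pool;
  std::cout << (get_free_block(pool) ? "got a block" : "no block") << '\n';
}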
+ + TODO: A more elegant way would have been to return one freed + up block to the caller here but the code that deals with + removing the block from buf_pool.page_hash and buf_pool.LRU is fairly + involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We + can do that in a separate patch sometime in future. */ + + if (!buf_flush_LRU(innodb_lru_flush_size)) { + MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); + ++flush_failures; + } + + n_iterations++; + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.LRU_waits++; + goto got_mutex; +} + +/** Move the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. */ +static void buf_LRU_old_adjust_len() +{ + ulint old_len; + ulint new_len; + + ut_a(buf_pool.LRU_old); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); + compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN + > BUF_LRU_OLD_RATIO_DIV + * (BUF_LRU_OLD_TOLERANCE + 5)); + compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN); + +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + + old_len = buf_pool.LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + for (;;) { + buf_page_t* LRU_old = buf_pool.LRU_old; + + ut_a(LRU_old); + ut_ad(LRU_old->in_LRU_list); +#ifdef UNIV_LRU_DEBUG + ut_a(LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + + /* Update the LRU_old pointer if necessary */ + + if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { + + buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV( + LRU, LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + old_len = ++buf_pool.LRU_old_len; + LRU_old->set_old(true); + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); + old_len = --buf_pool.LRU_old_len; + LRU_old->set_old(false); + } else { + return; + } + } +} + +/** Initialize the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ +static void buf_LRU_old_init() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage)) { + + ut_ad(bpage->in_LRU_list); + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). */ + bpage->old = true; + } + + buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU); + buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU); + + buf_LRU_old_adjust_len(); +} + +/** Remove a block from the unzip_LRU list if it belonged to the list. 
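The arithmetic in buf_LRU_old_adjust_len() above is easier to see without the list manipulation around it: the "old" sublist is kept at LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the total LRU length, capped so that a few non-old blocks always remain, and the boundary only moves once the deviation exceeds the tolerance. In the sketch below the tolerance (20) and the non-old minimum (5) come from the constants declared earlier in this file; the ratio denominator and the sample ratio are illustrative, not the values compiled into InnoDB.

// Sketch of the old-sublist length adjustment, under the assumptions above.
#include <algorithm>
#include <cstddef>
#include <iostream>

constexpr std::size_t OLD_RATIO_DIV   = 1024;  // illustrative denominator
constexpr std::size_t OLD_TOLERANCE   = 20;    // BUF_LRU_OLD_TOLERANCE
constexpr std::size_t NON_OLD_MIN_LEN = 5;     // BUF_LRU_NON_OLD_MIN_LEN

// Returns the corrected length of the old sublist.
static std::size_t adjust_old_len(std::size_t lru_len, std::size_t old_ratio,
                                  std::size_t old_len)
{
  const std::size_t new_len =
      std::min(lru_len * old_ratio / OLD_RATIO_DIV,
               lru_len - (OLD_TOLERANCE + NON_OLD_MIN_LEN));

  // Move the boundary one block at a time, like the pointer walk in the real
  // function, but only while it lies outside the tolerance band.
  while (old_len + OLD_TOLERANCE < new_len)
    ++old_len;                                 // LRU_old moves towards the head
  while (old_len > new_len + OLD_TOLERANCE)
    --old_len;                                 // LRU_old moves towards the tail
  return old_len;
}

int main()
{
  // A 10000-block LRU list with a 379/1024 (about 37 %) "old" ratio.
  std::cout << adjust_old_len(10000, 379, 2000) << '\n';  // grows towards ~3700
  std::cout << adjust_old_len(10000, 379, 5000) << '\n';  // shrinks towards ~3720
}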
+@param[in] bpage control block */ +static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage) +{ + ut_ad(bpage->in_file()); + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (bpage->belongs_to_unzip_LRU()) { + buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); + + ut_ad(block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = false); + + UT_LIST_REMOVE(buf_pool.unzip_LRU, block); + } +} + +/** Removes a block from the LRU list. +@param[in] bpage control block */ +static inline void buf_LRU_remove_block(buf_page_t* bpage) +{ + /* Important that we adjust the hazard pointers before removing + bpage from the LRU list. */ + buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (bpage == buf_pool.LRU_old) { + + /* Below: the previous block is guaranteed to exist, + because the LRU_old pointer is only allowed to differ + by BUF_LRU_OLD_TOLERANCE from strict + buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + list length. */ + ut_a(prev_bpage); +#ifdef UNIV_LRU_DEBUG + ut_a(!prev_bpage->old); +#endif /* UNIV_LRU_DEBUG */ + buf_pool.LRU_old = prev_bpage; + prev_bpage->set_old(true); + + buf_pool.LRU_old_len++; + } + + buf_pool.stat.LRU_bytes -= bpage->physical_size(); + + buf_unzip_LRU_remove_block_if_needed(bpage); + + /* If the LRU list is so short that LRU_old is not defined, + clear the "old" flags and return */ + if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) { + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). */ + bpage->old = false; + } + + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + + return; + } + + ut_ad(buf_pool.LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (bpage->old) { + buf_pool.LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(); +} + +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(block->page.belongs_to_unzip_LRU()); + ut_ad(!block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = true); + + if (old) { + UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block); + } else { + UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block); + } +} + +/******************************************************************//** +Adds a block to the LRU list. 
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( + buf_page_t* bpage, /*!< in: control block */ + bool old) /*!< in: true if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!bpage->in_LRU_list); + + if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(buf_pool.LRU, bpage); + + bpage->freed_page_clock = buf_pool.freed_page_clock + & ((1U << 31) - 1); + } else { +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old, + bpage); + + buf_pool.LRU_old_len++; + } + + ut_d(bpage->in_LRU_list = TRUE); + + incr_LRU_size_in_bytes(bpage); + + if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool.LRU_old); + + /* Adjust the length of the old block list if necessary */ + + bpage->set_old(old); + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } else { + bpage->set_old(buf_pool.LRU_old != NULL); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (bpage->belongs_to_unzip_LRU()) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, old); + } +} + +/** Move a block to the start of the LRU list. */ +void buf_page_make_young(buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + + mysql_mutex_lock(&buf_pool.mutex); + + if (UNIV_UNLIKELY(bpage->old)) + buf_pool.stat.n_pages_made_young++; + + buf_LRU_remove_block(bpage); + buf_LRU_add_block(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. +@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +{ + const page_id_t id(bpage->id()); + buf_page_t* b = nullptr; + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + + /* First, perform a quick check before we acquire hash_lock. */ + if (!bpage->can_relocate()) { + return false; + } + + /* We must hold an exclusive hash_lock to prevent + bpage->can_relocate() from changing due to a concurrent + execution of buf_page_get_low(). */ + const ulint fold = id.fold(); + page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + lsn_t oldest_modification = bpage->oldest_modification_acquire(); + + if (UNIV_UNLIKELY(!bpage->can_relocate())) { + /* Do not free buffer fixed and I/O-fixed blocks. 
*/ + goto func_exit; + } + + if (oldest_modification == 1) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + oldest_modification = bpage->oldest_modification(); + if (oldest_modification) { + ut_ad(oldest_modification == 1); + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!bpage->oldest_modification()); + oldest_modification = 0; + } + + if (zip || !bpage->zip.data) { + /* This would completely free the block. */ + /* Do not completely free dirty blocks. */ + + if (oldest_modification) { + goto func_exit; + } + } else if (oldest_modification + && bpage->state() != BUF_BLOCK_FILE_PAGE) { +func_exit: + hash_lock->write_unlock(); + return(false); + + } else if (bpage->state() == BUF_BLOCK_FILE_PAGE) { + b = buf_page_alloc_descriptor(); + ut_a(b); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + new (b) buf_page_t(*bpage); + b->set_state(BUF_BLOCK_ZIP_PAGE); + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + + DBUG_PRINT("ib_buf", ("free page %u:%u", + id.space(), id.page_no())); + + ut_ad(bpage->can_relocate()); + + if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) { + ut_ad(!b); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + return(true); + } + + /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr + then it was a compressed page with an uncompressed frame and + we are interested in freeing only the uncompressed frame. + Therefore we have to reinsert the compressed page descriptor + into the LRU and page_hash (and possibly flush_list). + if !b then it was a regular page that has been freed */ + + if (UNIV_LIKELY_NULL(b)) { + buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); + + ut_ad(!buf_pool.page_hash_get_low(id, fold)); + ut_ad(b->zip_size()); + + /* The field in_LRU_list of + the to-be-freed block descriptor should have + been cleared in + buf_LRU_block_remove_hashed(), which + invokes buf_LRU_remove_block(). */ + ut_ad(!bpage->in_LRU_list); + + /* bpage->state was BUF_BLOCK_FILE_PAGE because + b != nullptr. The type cast below is thus valid. */ + ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); + + /* The fields of bpage were copied to b before + buf_LRU_block_remove_hashed() was invoked. */ + ut_ad(!b->in_zip_hash); + ut_ad(b->in_LRU_list); + ut_ad(b->in_page_hash); + + HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b); + + /* Insert b where bpage was in the LRU list. */ + if (prev_b) { + ulint lru_len; + + ut_ad(prev_b->in_LRU_list); + ut_ad(prev_b->in_file()); + + UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b); + + incr_LRU_size_in_bytes(b); + + if (b->is_old()) { + buf_pool.LRU_old_len++; + if (buf_pool.LRU_old + == UT_LIST_GET_NEXT(LRU, b)) { + + buf_pool.LRU_old = b; + } + } + + lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + if (lru_len > BUF_LRU_OLD_MIN_LEN) { + ut_ad(buf_pool.LRU_old); + /* Adjust the length of the + old block list if necessary */ + buf_LRU_old_adjust_len(); + } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is now long + enough for LRU_old to become + defined: init it */ + buf_LRU_old_init(); + } +#ifdef UNIV_LRU_DEBUG + /* Check that the "old" flag is consistent + in the block and its neighbours. 
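The top of buf_LRU_free_page() above uses a check, latch, re-check pattern: an unlatched pre-check rejects obviously pinned pages cheaply, and the authoritative can_relocate() test is repeated after taking the exclusive page_hash latch, because the fix count can change in between. The sketch below shows only that pattern; the mutex and atomic pin counter are stand-ins for the real latches and fix counts, and the actual removal work is elided.

// Sketch of the check-latch-recheck pattern, under the assumptions above.
#include <atomic>
#include <iostream>
#include <mutex>

struct page
{
  std::atomic<unsigned> pins{0};     // buffer-fix / io-fix count, simplified
  bool can_relocate() const { return pins.load(std::memory_order_acquire) == 0; }
};

static std::mutex hash_latch;        // plays the role of the page_hash latch

static bool try_free(page& p)
{
  if (!p.can_relocate())
    return false;                    // cheap early-out, no latch taken

  std::lock_guard<std::mutex> g(hash_latch);
  if (!p.can_relocate())
    return false;                    // somebody pinned the page meanwhile

  // ... the real code removes the page from page_hash and the LRU here ...
  return true;
}

int main()
{
  page p;
  std::cout << try_free(p) << '\n';  // 1: nothing pins the page
  p.pins = 1;
  std::cout << try_free(p) << '\n';  // 0: rejected by the early check
}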
*/ + b->set_old(b->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } else { + ut_d(b->in_LRU_list = FALSE); + buf_LRU_add_block(b, b->old); + } + + buf_flush_relocate_on_flush_list(bpage, b); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + bpage->zip.data = nullptr; + + page_zip_set_size(&bpage->zip, 0); + + /* Prevent buf_page_get_gen() from + decompressing the block while we release + hash_lock. */ + b->set_io_fix(BUF_IO_PIN); + hash_lock->write_unlock(); + } else if (!zip) { + hash_lock->write_unlock(); + } + + buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); + +#ifdef BTR_CUR_HASH_ADAPT + if (block->index) { + mysql_mutex_unlock(&buf_pool.mutex); + + /* Remove the adaptive hash index on the page. + The page was declared uninitialized by + buf_LRU_block_remove_hashed(). We need to flag + the contents of the page valid (which it still is) in + order to avoid bogus Valgrind or MSAN warnings.*/ + + MEM_MAKE_DEFINED(block->frame, srv_page_size); + btr_search_drop_page_hash_index(block); + MEM_UNDEFINED(block->frame, srv_page_size); + + if (UNIV_LIKELY_NULL(b)) { + ut_ad(b->zip_size()); + b->io_unfix(); + } + + mysql_mutex_lock(&buf_pool.mutex); + } else +#endif + if (UNIV_LIKELY_NULL(b)) { + ut_ad(b->zip_size()); + b->io_unfix(); + } + + buf_LRU_block_free_hashed_page(block); + + return(true); +} + +/******************************************************************//** +Puts a block back to the free list. */ +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block) /*!< in: block, must not contain a file page */ +{ + void* data; + + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); + assert_block_ahi_empty(block); + ut_ad(!block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + + block->page.set_state(BUF_BLOCK_NOT_USED); + + MEM_UNDEFINED(block->frame, srv_page_size); + /* Wipe page_no and space_id */ + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xfe, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + 0xfe, 4); + data = block->page.zip.data; + + if (data != NULL) { + block->page.zip.data = NULL; + buf_pool_mutex_exit_forbid(); + + ut_ad(block->zip_size()); + + buf_buddy_free(data, block->zip_size()); + + buf_pool_mutex_exit_allow(); + page_zip_set_size(&block->page.zip, 0); + } + + if (buf_pool.curr_size < buf_pool.old_size + && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target + && buf_pool.will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_ADD_LAST( + buf_pool.withdraw, + &block->page); + ut_d(block->in_withdraw_list = true); + } else { + UT_LIST_ADD_FIRST(buf_pool.free, &block->page); + ut_d(block->page.in_free_list = true); + pthread_cond_signal(&buf_pool.done_free); + } + + MEM_NOACCESS(block->frame, srv_page_size); +} + +/** Release a memory block to the buffer pool. */ +ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) +{ + ut_ad(this == &buf_pool); + mysql_mutex_lock(&mutex); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&mutex); +} + + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(), +the object will be freed. 
+ +@param bpage buffer block +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + page_hash_latch *hash_lock, bool zip) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(hash_lock->is_write_locked()); + + ut_a(bpage->io_fix() == BUF_IO_NONE); + ut_a(!bpage->buf_fix_count()); + + buf_LRU_remove_block(bpage); + + buf_pool.freed_page_clock += 1; + + switch (bpage->state()) { + case BUF_BLOCK_FILE_PAGE: + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(((buf_block_t*) bpage)->frame, + srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + if (bpage->zip.data) { + const page_t* page = ((buf_block_t*) bpage)->frame; + + ut_a(!zip || !bpage->oldest_modification()); + ut_ad(bpage->zip_size()); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. */ + memcpy(bpage->zip.data, page, + bpage->zip_size()); + } + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: +#if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT + /* During recovery, we only update the + compressed page, not the uncompressed one. 
*/ + ut_a(recv_recovery_is_on() + || page_zip_validate( + &bpage->zip, page, + ((buf_block_t*) bpage)->index)); +#endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */ + break; + default: + ib::error() << "The compressed page to be" + " evicted seems corrupt:"; + ut_print_buf(stderr, page, srv_page_size); + + ib::error() << "Possibly older version of" + " the page:"; + + ut_print_buf(stderr, bpage->zip.data, + bpage->zip_size()); + putc('\n', stderr); + ut_error; + } + + break; + } + /* fall through */ + case BUF_BLOCK_ZIP_PAGE: + ut_a(!bpage->oldest_modification()); + MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + + ut_ad(!bpage->in_zip_hash); + HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage); + + switch (bpage->state()) { + case BUF_BLOCK_ZIP_PAGE: + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->in_LRU_list); + ut_a(bpage->zip.data); + ut_a(bpage->zip.ssize); + ut_ad(!bpage->oldest_modification()); + + hash_lock->write_unlock(); + buf_pool_mutex_exit_forbid(); + + buf_buddy_free(bpage->zip.data, bpage->zip_size()); + + buf_pool_mutex_exit_allow(); + buf_page_free_descriptor(bpage); + return(false); + + case BUF_BLOCK_FILE_PAGE: + static_assert(FIL_NULL == 0xffffffffU, "fill pattern"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(reinterpret_cast<buf_block_t*>(bpage)->frame + + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(reinterpret_cast<buf_block_t*>(bpage)->frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size); + bpage->set_state(BUF_BLOCK_REMOVE_HASH); + + if (!zip) { + return true; + } + + /* Question: If we release hash_lock here + then what protects us against: + 1) Some other thread buffer fixing this page + 2) Some other thread trying to read this page and + not finding it in buffer pool attempting to read it + from the disk. + Answer: + 1) Cannot happen because the page is no longer in the + page_hash. Only possibility is when while invalidating + a tablespace we buffer fix the prev_page in LRU to + avoid relocation during the scan. But that is not + possible because we are holding buf_pool mutex. + + 2) Not possible because in buf_page_init_for_read() + we do a look up of page_hash while holding buf_pool + mutex and since we are holding buf_pool mutex here + and by the time we'll release it in the caller we'd + have inserted the compressed only descriptor in the + page_hash. */ + hash_lock->write_unlock(); + + if (bpage->zip.data) { + /* Free the compressed page. */ + void* data = bpage->zip.data; + bpage->zip.data = NULL; + + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->oldest_modification()); + ut_ad(!bpage->in_LRU_list); + buf_pool_mutex_exit_forbid(); + + buf_buddy_free(data, bpage->zip_size()); + + buf_pool_mutex_exit_allow(); + + page_zip_set_size(&bpage->zip, 0); + } + + return(true); + + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + ut_error; + return(false); +} + +/** Remove one page from LRU list and put it to free list. 
+@param bpage file page to be freed +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) */ +void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id, + page_hash_latch *hash_lock) +{ + while (bpage->buf_fix_count()) + /* Wait for other threads to release the fix count + before releasing the bpage from LRU list. */ + (void) LF_BACKOFF(); + + if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true)) + buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage)); +} + +/** Update buf_pool.LRU_old_ratio. +@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust) +{ + uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + mysql_mutex_lock(&buf_pool.mutex); + + if (ratio != buf_pool.LRU_old_ratio) { + buf_pool.LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool.LRU) + >= BUF_LRU_OLD_MIN_LEN) { + buf_LRU_old_adjust_len(); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } else { + buf_pool.LRU_old_ratio = ratio; + } + /* the reverse of + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ + return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); +} + +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update() +{ + buf_LRU_stat_t* item; + buf_LRU_stat_t cur_stat; + + if (!buf_pool.freed_page_clock) { + goto func_exit; + } + + /* Update the index. */ + item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; + buf_LRU_stat_arr_ind++; + buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; + + /* Add the current value and subtract the obsolete entry. + Since buf_LRU_stat_cur is not protected by any mutex, + it can be changing between adding to buf_LRU_stat_sum + and copying to item. Assign it to local variables to make + sure the same value assign to the buf_LRU_stat_sum + and item */ + cur_stat = buf_LRU_stat_cur; + + buf_LRU_stat_sum.io += cur_stat.io - item->io; + buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; + + /* Put current entry in the array. */ + memcpy(item, &cur_stat, sizeof *item); + +func_exit: + /* Clear the current entry. */ + memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); +} + +#ifdef UNIV_DEBUG +/** Validate the LRU list. 
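buf_LRU_old_ratio_update() above converts innodb_old_blocks_pct into a fixed-point ratio with denominator BUF_LRU_OLD_RATIO_DIV, clamps it, and converts it back to a rounded percentage for reporting. The sketch below works through that round trip; the DIV/MIN/MAX constants are placeholders chosen for illustration and are not necessarily the values InnoDB uses.

// Sketch of the percentage <-> fixed-point round trip, under the assumptions above.
#include <iostream>

constexpr unsigned RATIO_DIV = 1024;
constexpr unsigned RATIO_MIN = RATIO_DIV * 5 / 100;    // about 5 %
constexpr unsigned RATIO_MAX = RATIO_DIV * 95 / 100;   // about 95 %

static unsigned old_ratio_update(unsigned old_pct)
{
  unsigned ratio = old_pct * RATIO_DIV / 100;
  if (ratio < RATIO_MIN) ratio = RATIO_MIN;
  else if (ratio > RATIO_MAX) ratio = RATIO_MAX;
  // Reverse of the conversion above, rounded to the nearest percent.
  return unsigned(ratio * 100 / double(RATIO_DIV) + 0.5);
}

int main()
{
  std::cout << old_ratio_update(37) << '\n';  // 37: this round trip is lossless
  std::cout << old_ratio_update(2) << '\n';   // clamped up to the minimum (~5)
  std::cout << old_ratio_update(99) << '\n';  // clamped down to the maximum (~95)
}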
*/ +void buf_LRU_validate() +{ + ulint old_len; + ulint new_len; + + mysql_mutex_lock(&buf_pool.mutex); + + if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool.LRU_old); + old_len = buf_pool.LRU_old_len; + + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + CheckInLRUList::validate(); + + old_len = 0; + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + switch (bpage->state()) { + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: + ut_ad(reinterpret_cast<buf_block_t*>(bpage) + ->in_unzip_LRU_list + == bpage->belongs_to_unzip_LRU()); + case BUF_BLOCK_ZIP_PAGE: + break; + } + + if (bpage->is_old()) { + const buf_page_t* prev + = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next + = UT_LIST_GET_NEXT(LRU, bpage); + + if (!old_len++) { + ut_a(buf_pool.LRU_old == bpage); + } else { + ut_a(!prev || prev->is_old()); + } + + ut_a(!next || next->is_old()); + } + } + + ut_a(buf_pool.LRU_old_len == old_len); + + CheckInFreeList::validate(); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(bpage->state() == BUF_BLOCK_NOT_USED); + } + + CheckUnzipLRUAndLRUList::validate(); + + for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU); + block != NULL; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(block->page.belongs_to_unzip_LRU()); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print() +{ + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const page_id_t id(bpage->id()); + + fprintf(stderr, "BLOCK space %u page %u ", + id.space(), id.page_no()); + + if (bpage->is_old()) { + fputs("old ", stderr); + } + + if (const uint32_t buf_fix_count = bpage->buf_fix_count()) { + fprintf(stderr, "buffix count %u ", buf_fix_count); + } + + if (const auto io_fix = bpage->io_fix()) { + fprintf(stderr, "io_fix %d ", io_fix); + } + + if (bpage->oldest_modification()) { + fputs("modif. 
", stderr); + } + + switch (const auto state = bpage->state()) { + const byte* frame; + case BUF_BLOCK_FILE_PAGE: + frame = buf_block_get_frame((buf_block_t*) bpage); + fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n", + fil_page_get_type(frame), + btr_page_get_index_id(frame)); + break; + case BUF_BLOCK_ZIP_PAGE: + frame = bpage->zip.data; + fprintf(stderr, "\ntype %u size " ULINTPF + " index id " IB_ID_FMT "\n", + fil_page_get_type(frame), + bpage->zip_size(), + btr_page_get_index_id(frame)); + break; + + default: + fprintf(stderr, "\n!state %d!\n", state); + break; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc new file mode 100644 index 00000000..253a2542 --- /dev/null +++ b/storage/innobase/buf/buf0rea.cc @@ -0,0 +1,785 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0rea.cc +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include <mysql/service_thd_wait.h> + +#include "buf0rea.h" +#include "fil0fil.h" +#include "mtr0mtr.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "ibuf0ibuf.h" +#include "log0recv.h" +#include "trx0sys.h" +#include "os0file.h" +#include "srv0start.h" +#include "srv0srv.h" + +/** If there are buf_pool.curr_size per the number below pending reads, then +read-ahead is not done: this is to prevent flooding the buffer pool with +i/o-fixed buffer blocks */ +#define BUF_READ_AHEAD_PEND_LIMIT 2 + +/** Remove the sentinel block for the watch before replacing it with a +real block. watch_unset() or watch_occurred() will notice +that the block has been replaced with the real block. +@param watch sentinel */ +inline void buf_pool_t::watch_remove(buf_page_t *watch) +{ + ut_ad(hash_lock_get(watch->id())->is_write_locked()); + ut_a(watch_is_sentinel(*watch)); + if (watch->buf_fix_count()) + { + ut_ad(watch->in_page_hash); + ut_d(watch->in_page_hash= false); + HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch); + watch->set_buf_fix_count(0); + } + ut_ad(!watch->in_page_hash); + watch->set_state(BUF_BLOCK_NOT_USED); + watch->id_= page_id_t(~0ULL); +} + +/** Initialize a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. 
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] unzip whether the uncompressed page is + requested (for ROW_FORMAT=COMPRESSED) +@return pointer to the block +@retval NULL in case of an error */ +static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, + ulint zip_size, bool unzip) +{ + mtr_t mtr; + + if (mode == BUF_READ_IBUF_PAGES_ONLY) + { + /* It is a read-ahead within an ibuf routine */ + ut_ad(!ibuf_bitmap_page(page_id, zip_size)); + ibuf_mtr_start(&mtr); + + if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr)) + { + ibuf_mtr_commit(&mtr); + return nullptr; + } + } + else + ut_ad(mode == BUF_READ_ANY_PAGE); + + buf_page_t *bpage= nullptr; + buf_block_t *block= nullptr; + if (!zip_size || unzip || recv_recovery_is_on()) + { + block= buf_LRU_get_free_block(false); + block->initialise(page_id, zip_size); + /* We set a pass-type x-lock on the frame because then + the same thread which called for the read operation + (and is running now at this point of code) can wait + for the read to complete by waiting for the x-lock on + the frame; if the x-lock were recursive, the same + thread would illegally get the x-lock before the page + read is completed. The x-lock will be released + in buf_page_read_complete() by the io-handler thread. */ + rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); + } + + const ulint fold= page_id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + /* We must acquire hash_lock this early to prevent + a race condition with buf_pool_t::watch_remove() */ + page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold); + hash_lock->write_lock(); + + buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold); + if (hash_page && !buf_pool.watch_is_sentinel(*hash_page)) + { + /* The page is already in the buffer pool. */ + hash_lock->write_unlock(); + if (block) + { + rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ); + buf_LRU_block_free_non_file_page(block); + } + goto func_exit; + } + + if (UNIV_LIKELY(block != nullptr)) + { + bpage= &block->page; + + /* Insert into the hash table of file pages */ + if (hash_page) + { + /* Preserve the reference count. */ + auto buf_fix_count= hash_page->buf_fix_count(); + ut_a(buf_fix_count > 0); + block->page.add_buf_fix_count(buf_fix_count); + buf_pool.watch_remove(hash_page); + } + + block->page.set_io_fix(BUF_IO_READ); + block->page.set_state(BUF_BLOCK_FILE_PAGE); + ut_ad(!block->page.in_page_hash); + ut_d(block->page.in_page_hash= true); + HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage); + hash_lock->write_unlock(); + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(bpage, true/* to old blocks */); + + if (UNIV_UNLIKELY(zip_size)) + { + /* buf_pool.mutex may be released and reacquired by + buf_buddy_alloc(). We must defer this operation until after the + block descriptor has been added to buf_pool.LRU and + buf_pool.page_hash. */ + block->page.zip.data= static_cast<page_zip_t*> + (buf_buddy_alloc(zip_size)); + + /* To maintain the invariant + block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU() + we have to add this block to unzip_LRU + after block->page.zip.data is set. 
*/ + ut_ad(block->page.belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(block, TRUE); + } + } + else + { + hash_lock->write_unlock(); + + /* The compressed page must be allocated before the + control block (bpage), in order to avoid the + invocation of buf_buddy_relocate_block() on + uninitialized data. */ + bool lru= false; + void *data= buf_buddy_alloc(zip_size, &lru); + + hash_lock->write_lock(); + + /* If buf_buddy_alloc() allocated storage from the LRU list, + it released and reacquired buf_pool.mutex. Thus, we must + check the page_hash again, as it may have been modified. */ + if (UNIV_UNLIKELY(lru)) + { + hash_page= buf_pool.page_hash_get_low(page_id, fold); + + if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page))) + { + /* The block was added by some other thread. */ + hash_lock->write_unlock(); + buf_buddy_free(data, zip_size); + goto func_exit; + } + } + + bpage= buf_page_alloc_descriptor(); + + page_zip_des_init(&bpage->zip); + page_zip_set_size(&bpage->zip, zip_size); + bpage->zip.data = (page_zip_t*) data; + + bpage->init(BUF_BLOCK_ZIP_PAGE, page_id); + + if (hash_page) + { + /* Preserve the reference count. It can be 0 if + buf_pool_t::watch_unset() is executing concurrently, + waiting for buf_pool.mutex, which we are holding. */ + bpage->add_buf_fix_count(hash_page->buf_fix_count()); + buf_pool.watch_remove(hash_page); + } + + ut_ad(!bpage->in_page_hash); + ut_d(bpage->in_page_hash= true); + HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage); + bpage->set_io_fix(BUF_IO_READ); + hash_lock->write_unlock(); + + /* The block must be put to the LRU list, to the old blocks. + The zip size is already set into the page zip */ + buf_LRU_add_block(bpage, true/* to old blocks */); + } + + mysql_mutex_unlock(&buf_pool.mutex); + buf_pool.n_pend_reads++; + goto func_exit_no_mutex; +func_exit: + mysql_mutex_unlock(&buf_pool.mutex); +func_exit_no_mutex: + if (mode == BUF_READ_IBUF_PAGES_ONLY) + ibuf_mtr_commit(&mtr); + + ut_ad(!bpage || bpage->in_file()); + + return bpage; +} + +/** Low-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there, in which case does nothing. +Sets the io_fix flag and sets an exclusive lock on the buffer frame. The +flag is cleared and the x-lock released by an i/o-handler thread. + +@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED + if we are trying + to read from a non-existent tablespace +@param[in,out] space tablespace +@param[in] sync true if synchronous aio is desired +@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ..., +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] unzip true=request uncompressed page +@return whether a read request was queued */ +static +bool +buf_read_page_low( + dberr_t* err, + fil_space_t* space, + bool sync, + ulint mode, + const page_id_t page_id, + ulint zip_size, + bool unzip) +{ + buf_page_t* bpage; + + *err = DB_SUCCESS; + + if (buf_dblwr.is_inside(page_id)) { + ib::error() << "Trying to read doublewrite buffer page " + << page_id; + ut_ad(0); +nothing_read: + space->release(); + return false; + } + + if (sync) { + } else if (trx_sys_hdr_page(page_id) + || ibuf_bitmap_page(page_id, zip_size) + || (!recv_no_ibuf_operations + && ibuf_page(page_id, zip_size, nullptr))) { + + /* Trx sys header is so low in the latching order that we play + safe and do not leave the i/o-completion to an asynchronous + i/o-thread. 
Change buffer pages must always be read with + syncronous i/o, to make sure they do not get involved in + thread deadlocks. */ + sync = true; + } + + /* The following call will also check if the tablespace does not exist + or is being dropped; if we succeed in initing the page in the buffer + pool for read, then DISCARD cannot proceed until the read has + completed */ + bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip); + + if (bpage == NULL) { + goto nothing_read; + } + + ut_ad(bpage->in_file()); + + if (sync) { + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + } + + DBUG_LOG("ib_buf", + "read page " << page_id << " zip_size=" << zip_size + << " unzip=" << unzip << ',' << (sync ? "sync" : "async")); + + void* dst; + + if (zip_size) { + dst = bpage->zip.data; + } else { + ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE); + + dst = ((buf_block_t*) bpage)->frame; + } + + const ulint len = zip_size ? zip_size : srv_page_size; + + auto fio = space->io(IORequest(sync + ? IORequest::READ_SYNC + : IORequest::READ_ASYNC), + page_id.page_no() * len, len, dst, bpage); + *err= fio.err; + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { + if (!sync || fio.err == DB_TABLESPACE_DELETED) { + buf_pool.corrupted_evict(bpage); + return false; + } + + ut_error; + } + + if (sync) { + thd_wait_end(NULL); + + /* The i/o was already completed in space->io() */ + *err = buf_page_read_complete(bpage, *fio.node); + space->release(); + + if (*err != DB_SUCCESS) { + return false; + } + } + + return true; +} + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@param[in] page_id page id of a page which the current thread +wants to access +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! 
*/ +ulint +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) +{ + if (!srv_random_read_ahead) + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + fil_space_t* space= fil_space_t::get(page_id.space()); + if (!space) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + ulint count= 5 + buf_read_ahead_area / 8; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + page_id_t high= low + buf_read_ahead_area; + high.set_page_no(std::min(high.page_no(), space->last_page_number())); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (page_id_t i= low; i < high; ++i) + { + const ulint fold= i.fold(); + page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold); + const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold); + bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage); + hash_lock->read_unlock(); + if (found && !--count) + goto read_ahead; + } + +no_read_ahead: + space->release(); + return 0; + +read_ahead: + if (space->is_stopping()) + goto no_read_ahead; + + /* Read all the suitable blocks within the area */ + const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + + for (page_id_t i= low; i < high; ++i) + { + if (ibuf_bitmap_page(i, zip_size)) + continue; + if (space->is_stopping()) + break; + dberr_t err; + space->reacquire(); + if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false)) + count++; + } + + if (count) + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + low.page_no())); + space->release(); + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool.stat.n_ra_pages_read_rnd+= count; + srv_stats.buf_pool_reads.add(count); + return count; +} + +/** High-level function which reads a page from a file to buf_pool +if it is not already there. Sets the io_fix and an exclusive lock +on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted, +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) +{ + fil_space_t *space= fil_space_t::get(page_id.space()); + if (!space) + { + ib::info() << "trying to read page " << page_id + << " in nonexisting or being-dropped tablespace"; + return DB_TABLESPACE_DELETED; + } + + dberr_t err; + if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE, + page_id, zip_size, false)) + srv_stats.buf_pool_reads.add(1); + + buf_LRU_stat_inc_io(); + return err; +} + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. 
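The window arithmetic in buf_read_ahead_random() above is worth isolating: the accessed page is mapped to an aligned window of read_ahead_area pages, and read-ahead of the whole window is triggered once 5 + area/8 pages inside it have been accessed recently. In the sketch below the recently-accessed pages are a plain set and the function name is invented; the real code consults buf_pool.page_hash under its latches and also checks pending-read limits and tablespace state.

// Sketch of the random read-ahead trigger, under the assumptions above.
#include <cstdint>
#include <iostream>
#include <set>

static bool want_random_readahead(uint32_t page_no, uint32_t area,
                                  uint32_t last_page_no,
                                  const std::set<uint32_t>& recently_accessed)
{
  const uint32_t low = page_no - (page_no % area);        // aligned window start
  uint32_t high = low + area;                             // one past the window end
  if (high > last_page_no + 1)
    high = last_page_no + 1;                              // clip to the tablespace

  uint32_t threshold = 5 + area / 8;                      // pages that must be hot
  for (uint32_t i = low; i < high; i++)
    if (recently_accessed.count(i) && --threshold == 0)
      return true;                                        // trigger read-ahead
  return false;
}

int main()
{
  std::set<uint32_t> hot;
  for (uint32_t i = 128; i < 145; i++) hot.insert(i);     // 17 hot pages in one window
  std::cout << want_random_readahead(130, 64, 10000, hot) << '\n'; // 1: threshold reached
  std::cout << want_random_readahead(700, 64, 10000, hot) << '\n'; // 0: different window
}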
Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out]	space		tablespace
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	sync		true if synchronous aio is desired */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+			      ulint zip_size, bool sync)
+{
+	dberr_t err;
+
+	if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE,
+			      page_id, zip_size, false)) {
+		srv_stats.buf_pool_reads.add(1);
+	}
+
+	switch (err) {
+	case DB_SUCCESS:
+	case DB_ERROR:
+		break;
+	case DB_TABLESPACE_DELETED:
+		ib::info() << "trying to read page " << page_id
+			   << " in the background"
+			" in a non-existing or being-dropped tablespace";
+		break;
+	case DB_PAGE_CORRUPTED:
+	case DB_DECRYPTION_FAILED:
+		ib::error()
+			<< "Background Page read failed to "
+			"read or decrypt " << page_id;
+		break;
+	default:
+		ib::fatal() << "Error " << err << " in background read of "
+			    << page_id;
+	}
+
+	/* We do not increment the number of I/O operations used for the LRU
+	policy here (buf_LRU_stat_inc_io()). That count is used in heuristics
+	to decide about evicting uncompressed versions of compressed pages
+	from the buffer pool. Since this function is called from the buffer
+	pool load, these I/Os are deliberate and not part of the normal
+	workload, so we can ignore them in our heuristics. */
+}
+
+/** Applies linear read-ahead if the page is a border page of a linear
+read-ahead area in the buf_pool and all the pages in the area have been
+accessed. Does not read any page if the read-ahead mechanism is not
+activated. Note that the algorithm looks at the 'natural' adjacent successor
+and predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed for the first time, calling this function just after it has been
+buffer-fixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous i/o.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside an ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+  /* Check if read-ahead is disabled */
+  if (!srv_read_ahead_threshold)
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+  /* We will check that almost all pages in the area have been accessed
+  in the desired order. */
+  const bool descending= page_id == low;
+
+  if (!descending && page_id != high_1)
+    /* This is not a border page of the area */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  fil_space_t *space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  if (high_1.page_no() > space->last_page_number())
+  {
+    /* The area is not whole. */
+fail:
+    space->release();
+    return 0;
+  }
+
+  /* How many out-of-order accessed pages can we ignore
+  when working out the access pattern for linear read-ahead */
+  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+                               srv_read_ahead_threshold,
+                               uint32_t{buf_pool.read_ahead_area});
+  page_id_t new_low= low, new_high_1= high_1;
+  unsigned prev_accessed= 0;
+  for (page_id_t i= low; i != high_1; ++i)
+  {
+    const ulint fold= i.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+    if (i == page_id)
+    {
+      /* Read the natural predecessor and successor page addresses from
+      the page; NOTE that because the calling thread may have an x-latch
+      on the page, we do not acquire an s-latch on the page; this is to
+      prevent deadlocks. The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself.
*/ + if (!bpage) + { +hard_fail: + hash_lock->read_unlock(); + goto fail; + } + const byte *f; + switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) { + case BUF_BLOCK_FILE_PAGE: + f= reinterpret_cast<const buf_block_t*>(bpage)->frame; + break; + case BUF_BLOCK_ZIP_PAGE: + f= bpage->zip.data; + break; + default: + goto hard_fail; + } + + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT)); + if (prev == FIL_NULL || next == FIL_NULL) + goto hard_fail; + page_id_t id= page_id; + if (descending && next - 1 == page_id.page_no()) + id.set_page_no(prev); + else if (!descending && prev + 1 == page_id.page_no()) + id.set_page_no(next); + else + goto hard_fail; /* Successor or predecessor not in the right order */ + + new_low= id - (id.page_no() % buf_read_ahead_area); + new_high_1= new_low + (buf_read_ahead_area - 1); + + if (id != new_low && id != new_high_1) + /* This is not a border page of the area: return */ + goto hard_fail; + if (new_high_1.page_no() > space->last_page_number()) + /* The area is not whole */ + goto hard_fail; + } + else if (!bpage) + { +failed: + hash_lock->read_unlock(); + if (--count) + continue; + goto fail; + } + + const unsigned accessed= bpage->is_accessed(); + if (!accessed) + goto failed; + /* Note that buf_page_t::is_accessed() returns the time of the + first access. If some blocks of the extent existed in the buffer + pool at the time of a linear access pattern, the first access + times may be nonmonotonic, even though the latest access times + were linear. The threshold (srv_read_ahead_factor) should help a + little against this. */ + bool fail= prev_accessed && + (descending ? prev_accessed > accessed : prev_accessed < accessed); + prev_accessed= accessed; + if (fail) + goto failed; + hash_lock->read_unlock(); + } + + /* If we got this far, read-ahead can be sensible: do it */ + count= 0; + for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + new_low != new_high_1; ++new_low) + { + if (ibuf_bitmap_page(new_low, zip_size)) + continue; + if (space->is_stopping()) + break; + dberr_t err; + space->reacquire(); + count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size, + false); + } + + if (count) + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + new_low.page_no())); + space->release(); + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool.stat.n_ra_pages_read+= count; + return count; +} + +/** Issues read requests for pages which recovery wants to read in. +@param[in] space_id tablespace id +@param[in] page_nos array of page numbers to read, with the +highest page number the last in the array +@param[in] n number of page numbers in the array */ +void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n) +{ + fil_space_t* space = fil_space_t::get(space_id); + + if (!space) { + /* The tablespace is missing or unreadable: do nothing */ + return; + } + + const ulint zip_size = space->zip_size(); + + for (ulint i = 0; i < n; i++) { + + /* Ignore if the page already present in freed ranges. 
*/
+		if (space->freed_ranges.contains(page_nos[i])) {
+			continue;
+		}
+
+		const page_id_t	cur_page_id(space_id, page_nos[i]);
+
+		ulint limit = 0;
+		for (ulint j = 0; j < buf_pool.n_chunks; j++) {
+			limit += buf_pool.chunks[j].size / 2;
+		}
+
+		for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
+			os_thread_sleep(10000);
+
+			if (!(++count % 1000)) {
+
+				ib::error()
+					<< "Waited for " << count / 100
+					<< " seconds for "
+					<< buf_pool.n_pend_reads
+					<< " pending reads";
+			}
+		}
+
+		dberr_t err;
+		space->reacquire();
+		buf_read_page_low(&err, space, false,
+				  BUF_READ_ANY_PAGE, cur_page_id, zip_size,
+				  true);
+
+		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
+			ib::error() << "Recovery failed to read or decrypt "
+				    << cur_page_id;
+		}
+	}
+
+	DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
+			      space->chain.start->name));
+	space->release();
+}
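
A minimal standalone sketch of the destination and length choice made in buf_read_page_low() above before the read is issued: a ROW_FORMAT=COMPRESSED page is read into the compressed-page buffer using the compressed size, any other page into the uncompressed frame using the full page size, and the file offset is the page number times that length. All names below are hypothetical stand-ins, not InnoDB declarations.

#include <cstdint>

/* Hypothetical helper; mirrors only the dst/len/offset selection. */
struct ReadTarget {
	void*		dst;	/* where the I/O lands */
	uint32_t	len;	/* number of bytes to read */
	uint64_t	offset;	/* byte offset within the tablespace file */
};

static ReadTarget pick_read_target(uint32_t page_no, uint32_t zip_size,
				   void* zip_data, void* frame,
				   uint32_t page_size = 16384)
{
	ReadTarget t;
	t.len = zip_size ? zip_size : page_size;   /* compressed or full size */
	t.dst = zip_size ? zip_data : frame;       /* zip.data or block frame */
	t.offset = uint64_t(page_no) * t.len;      /* page_no * len, as above */
	return t;
}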
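
The trigger rule in buf_read_ahead_random() above can be summarized as: align the requested page number down to a read-ahead area of buf_pool.read_ahead_area pages, then issue the read-ahead only once 5 + area/8 pages of that area (13 of 64 when the area is the usual 64 pages) are found in the buffer pool with a recent first access near the young end of the LRU list. A simplified standalone sketch with hypothetical names, reducing the "recently accessed and young" test to a boolean per page:

#include <cstdint>
#include <iostream>
#include <vector>

/* Align a page number down to the start of its read-ahead area. */
static uint32_t area_low(uint32_t page_no, uint32_t area)
{
	return page_no - (page_no % area);
}

/* Decide whether random read-ahead would fire for the area containing
page_no, given which page numbers were recently accessed. */
static bool random_read_ahead_fires(uint32_t page_no, uint32_t area,
				    const std::vector<bool>& recently_accessed)
{
	/* Same threshold as the source: 5 + area / 8 accessed pages. */
	uint32_t needed = 5 + area / 8;
	const uint32_t low = area_low(page_no, area);
	for (uint32_t i = low; i < low + area && i < recently_accessed.size(); i++)
		if (recently_accessed[i] && --needed == 0)
			return true;
	return false;
}

int main()
{
	const uint32_t area = 64;			/* buf_pool.read_ahead_area */
	std::vector<bool> accessed(256, false);
	for (uint32_t p = 128; p < 128 + 13; p++)	/* 13 = 5 + 64/8 pages touched */
		accessed[p] = true;
	std::cout << random_read_ahead_fires(130, area, accessed) << '\n';	/* 1 */
	std::cout << random_read_ahead_fires(10, area, accessed) << '\n';	/* 0 */
}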
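
Once buf_read_ahead_linear() above has established that the requested page is a border page of its area, it picks the next area to prefetch from the page's own header: the big-endian FIL_PAGE_PREV / FIL_PAGE_NEXT fields (offsets 8 and 12 of the FIL header) name the natural neighbours, and the one matching the scan direction is aligned down to a new area. A simplified sketch with hypothetical helper names; the real code additionally re-checks that the chosen page is a border page of the new area and that the new area lies wholly inside the tablespace:

#include <cstdint>
#include <optional>

/* Offsets of the pointer fields in the FIL page header, and the "null"
page number, per the InnoDB page format. */
constexpr uint32_t FIL_PAGE_PREV = 8;
constexpr uint32_t FIL_PAGE_NEXT = 12;
constexpr uint32_t FIL_NULL = 0xFFFFFFFF;

/* Big-endian 4-byte read, as mach_read_from_4() does. */
static uint32_t read_be32(const unsigned char* p)
{
	return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
		| uint32_t(p[2]) << 8 | uint32_t(p[3]);
}

/* Hypothetical helper: returns the first page of the area that linear
read-ahead would prefetch next, or nothing if the pattern does not qualify. */
static std::optional<uint32_t>
next_area_to_prefetch(uint32_t page_no, uint32_t area,
		      const unsigned char* frame)
{
	const uint32_t low = page_no - page_no % area;
	const uint32_t high_1 = low + area - 1;
	const bool descending = page_no == low;

	if (!descending && page_no != high_1)
		return std::nullopt;	/* not a border page of its area */

	const uint32_t prev = read_be32(frame + FIL_PAGE_PREV);
	const uint32_t next = read_be32(frame + FIL_PAGE_NEXT);
	if (prev == FIL_NULL || next == FIL_NULL)
		return std::nullopt;	/* no neighbour on one side */

	uint32_t target;
	if (descending && next - 1 == page_no)
		target = prev;		/* scan moving towards lower page numbers */
	else if (!descending && prev + 1 == page_no)
		target = next;		/* scan moving towards higher page numbers */
	else
		return std::nullopt;	/* neighbours not in sequential order */

	return target - target % area;	/* align to the start of the new area */
}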
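
The "accessed in the desired order" test in buf_read_ahead_linear() above tolerates a limited number of missing or out-of-order pages: the countdown starts at READ_AHEAD_PAGES - srv_read_ahead_threshold (capped by the area size), and the check gives up when it reaches zero. A simplified standalone sketch with hypothetical names; first_access holds buf_page_t::is_accessed() values for the pages of the area in ascending page-number order, with 0 meaning the page is absent or was never accessed, and tolerance (assumed >= 1) corresponds to that countdown:

#include <vector>

/* Hypothetical helper: returns true if the first-access times look like a
linear scan in the given direction; the check fails once `tolerance` absent,
never-accessed, or out-of-order pages have been seen, mirroring the
countdown in the source. */
static bool access_pattern_is_linear(const std::vector<unsigned>& first_access,
				     bool descending, unsigned tolerance)
{
	unsigned budget = tolerance;
	unsigned prev = 0;
	for (unsigned accessed : first_access) {
		/* Same predicate as the source: an existing previous access
		time must not contradict the scan direction. */
		bool fail = !accessed
			|| (prev && (descending ? prev > accessed
						: prev < accessed));
		if (accessed)
			prev = accessed;
		if (fail && !--budget)
			return false;
	}
	return true;
}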
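
buf_read_recv_pages() above throttles itself so that recovery never has more reads in flight than half of the buffer pool (half of every chunk, summed): while the pending-read count is at or above that limit it sleeps 10 ms at a time and logs a complaint roughly every 10 seconds. A standalone sketch of that waiting loop with hypothetical names:

#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

/* Hypothetical stand-in for buf_pool.n_pend_reads. */
static std::atomic<std::size_t> pending_reads{0};

/* Wait until the number of in-flight reads drops below half of the buffer
pool, mirroring the limit computation and logging cadence of the loop above. */
static void wait_for_read_slots(const std::vector<std::size_t>& chunk_sizes)
{
	std::size_t limit = 0;
	for (std::size_t sz : chunk_sizes)
		limit += sz / 2;	/* half of every chunk, summed */

	for (std::size_t count = 0; pending_reads.load() >= limit; ) {
		std::this_thread::sleep_for(std::chrono::milliseconds(10));
		if (!(++count % 1000))	/* 1000 iterations of 10 ms == ~10 s */
			std::fprintf(stderr,
				     "Waited %zu seconds for %zu pending reads\n",
				     count / 100, pending_reads.load());
	}
}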