summaryrefslogtreecommitdiffstats
path: root/storage/innobase/buf
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
commit3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/buf
parentInitial commit. (diff)
downloadmariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz
mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/buf')
-rw-r--r--storage/innobase/buf/buf0block_hint.cc59
-rw-r--r--storage/innobase/buf/buf0buddy.cc769
-rw-r--r--storage/innobase/buf/buf0buf.cc4180
-rw-r--r--storage/innobase/buf/buf0checksum.cc98
-rw-r--r--storage/innobase/buf/buf0dblwr.cc779
-rw-r--r--storage/innobase/buf/buf0dump.cc765
-rw-r--r--storage/innobase/buf/buf0flu.cc2765
-rw-r--r--storage/innobase/buf/buf0lru.cc1452
-rw-r--r--storage/innobase/buf/buf0rea.cc710
9 files changed, 11577 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000..6bd01faa
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+TRANSACTIONAL_TARGET
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+ /* To check if m_block belongs to the current buf_pool, we must
+ prevent freeing memory while we check, and until we buffer-fix the
+ block. For this purpose it is enough to latch any of the many
+ latches taken by buf_pool_t::resize().
+
+ Similar to buf_page_optimistic_get(), we must validate
+ m_block->page.id() after acquiring the hash_lock, because the object
+ may have been freed and not actually attached to buf_pool.page_hash
+ at the moment. (The block could have been reused to store a
+ different page, and that slice of buf_pool.page_hash could be protected
+ by another hash_lock that we are not holding.)
+
+ Finally, we must ensure that the block is not being freed. */
+ if (m_block)
+ {
+ auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(cell)};
+ if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+ m_block->page.frame && m_block->page.in_file())
+ m_block->page.fix();
+ else
+ clear();
+ }
+}
+} // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000..85a698bc
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,769 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in following invariants:
+* All blocks allocated by buddy allocator are used for compressed
+page frame.
+* A compressed table always have space_id < SRV_SPACE_ID_UPPER_BOUND
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller sized blocks but its starting
+ address must still remain the start of a page frame i.e.: it
+ cannot be middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+ the starting address of first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but it is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< If the buddy to completely free */
+ BUF_BUDDY_STATE_USED, /*!< Buddy currently in used */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i);
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+}
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE);
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out] buf block to stamp
+@param[in] i block size */
+static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i)
+{
+ buf_buddy_mem_invalid(buf, i);
+ compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU);
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);
+}
+
+/**********************************************************************//**
+Get the offset of the buddy of a compressed page frame.
+@return the buddy relative of page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == srv_page_size);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ CheckZipFree(ulint i) : m_i(i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_ad(buf_buddy_stamp_is_free(elem));
+ ut_ad(elem->stamp.size <= m_i);
+ }
+
+ const ulint m_i;
+};
+
+/** Validate a buddy list.
+@param[in] i buddy size to validate */
+static void buf_buddy_list_validate(ulint i)
+{
+ ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
+}
+
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free i.e.: in the
+zip_free[].
+@param[in] buf block to check
+@param[in] i index of buf_pool.zip_free[]
+@return true if free */
+static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool.zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+ /* A block may be free but a fragment of it may still be in use.
+ To guard against that we write the free block size in terms of
+ zip_free index at start of stamped block. Note that we can
+ safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/** Add a block to the head of the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+ ut_d(buf_buddy_list_validate(i));
+}
+
+/** Remove a block from the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_buddy_check_free(buf, i));
+
+ UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/** Try to allocate a block from buf_pool.zip_free[].
+@param[in] i index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
+{
+ buf_buddy_free_t* buf;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(buf_buddy_list_validate(i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ if (buf_pool.is_shrinking()
+ && UT_LIST_GET_LEN(buf_pool.withdraw)
+ < buf_pool.withdraw_target) {
+
+ while (buf != NULL
+ && buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ /* This should be withdrawn, not to be allocated */
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+ }
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf)
+ + (BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buddy));
+ buf_buddy_add_to_free(buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+ MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET);
+ MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
+
+/** Deallocate a buffer frame of srv_page_size.
+@param[in] buf buffer frame to deallocate */
+static
+void
+buf_buddy_block_free(void* buf)
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(!ut_align_offset(buf, srv_page_size));
+
+ HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->state() == buf_page_t::MEMORY
+ && bpage->in_zip_hash),
+ bpage->frame == buf);
+ ut_a(bpage);
+ ut_a(bpage->state() == buf_page_t::MEMORY);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = false);
+ HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+ bpage->hash = nullptr;
+
+ ut_d(memset(buf, 0, srv_page_size));
+ MEM_UNDEFINED(buf, srv_page_size);
+
+ block = (buf_block_t*) bpage;
+ buf_LRU_block_free_non_file_page(block);
+
+ ut_ad(buf_pool.buddy_n_frames > 0);
+ ut_d(buf_pool.buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
+
+ ut_a(block->page.frame);
+ ut_a(!ut_align_offset(block->page.frame, srv_page_size));
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
+
+ ut_d(buf_pool.buddy_n_frames++);
+}
+
+/** Allocate a block from a bigger object.
+@param[in] buf a block that is free to use
+@param[in] i index of buf_pool.zip_free[]
+@param[in] j size of buf as an index of buf_pool.zip_free[]
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists. */
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = (buf_block_t*) buf_buddy_alloc_zip(i);
+
+ if (block) {
+ goto func_exit;
+ }
+ }
+
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block) {
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ block = buf_LRU_get_free_block(true);
+ if (lru) {
+ *lru = true;
+ }
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES));
+
+func_exit:
+ buf_pool.buddy_stat[i].used++;
+ return reinterpret_cast<byte*>(block);
+}
+
+/** Try to relocate a block. The caller must hold zip_free_mutex, and this
+function will release and lock it again.
+@param[in] src block to relocate
+@param[in] dst free block to relocated to
+@param[in] i index of buf_pool.zip_free[]
+@param[in] force true if we must relocated always
+@return true if relocated */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ MEM_CHECK_ADDRESSABLE(dst, size);
+
+ uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind or MSAN warnings. */
+ MEM_MAKE_DEFINED(&space, sizeof space);
+ MEM_MAKE_DEFINED(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ const page_id_t page_id(space, offset);
+ /* FIXME: we are computing this while holding buf_pool.mutex */
+ auto &cell= buf_pool.page_hash.cell_get(page_id.fold());
+
+ bpage = buf_pool.page_hash.get(page_id, cell);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool.page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (!force || space != 0 || offset != 0) {
+ return(false);
+ }
+
+ /* It might be just uninitialized page.
+ We should search from LRU list also. */
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ while (bpage != NULL) {
+ if (bpage->zip.data == src) {
+ ut_ad(bpage->id() == page_id);
+ break;
+ }
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ if (bpage == NULL) {
+ return(false);
+ }
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ MEM_CHECK_ADDRESSABLE(src, size);
+
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell);
+ /* It does not make sense to use transactional_lock_guard here,
+ because the memcpy() of 1024 to 16384 bytes would likely make the
+ memory transaction too large. */
+ hash_lock.lock();
+
+ if (bpage->can_relocate()) {
+ /* Relocate the compressed page. */
+ const ulonglong ns = my_interval_timer();
+
+ ut_a(bpage->zip.data == src);
+
+ memcpy(dst, src, size);
+ bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+ hash_lock.unlock();
+
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
+ return(true);
+ }
+
+ hash_lock.unlock();
+
+ return(false);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
+{
+ buf_buddy_free_t* buddy;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(buf_pool.buddy_stat[i].used > 0);
+
+ buf_pool.buddy_stat[i].used--;
+recombine:
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ buf_buddy_block_free(buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+ && !buf_pool.is_shrinking()) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool.contains_zip(buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(buf_buddy_list_validate(i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+}
+
+/** Try to reallocate a block.
+@param[in] buf buf_pool block to be reallocated
+@param[in] size block size, up to srv_page_size
+@return whether the reallocation succeeded */
+bool
+buf_buddy_realloc(void* buf, ulint size)
+{
+ buf_block_t* block = NULL;
+ ulint i = buf_buddy_get_slot(size);
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i));
+ }
+
+ if (block == NULL) {
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block == NULL) {
+ return(false); /* free_list was not enough */
+ }
+
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(
+ block->page.frame, i, BUF_BUDDY_SIZES));
+ }
+
+ buf_pool.buddy_stat[i].used++;
+
+ /* Try to relocate the buddy of buf to the free block. */
+ if (buf_buddy_relocate(buf, block, i, true)) {
+ /* succeeded */
+ buf_buddy_free_low(buf, i);
+ } else {
+ /* failed */
+ buf_buddy_free_low(block, i);
+ }
+
+ return(true); /* free_list was enough */
+}
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.is_shrinking());
+
+ for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
+ buf_buddy_free_t* buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ /* seek to withdraw target */
+ while (buf != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+
+ while (buf != NULL) {
+ buf_buddy_free_t* next =
+ UT_LIST_GET_NEXT(list, buf);
+
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(
+ reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ /* seek to the next withdraw target */
+ while (true) {
+ while (next != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(next))) {
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buddy != next) {
+ break;
+ }
+
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buf_buddy_is_free(buddy, i)
+ == BUF_BUDDY_STATE_FREE) {
+ /* Both buf and buddy are free.
+ Try to combine them. */
+ buf_buddy_remove_from_free(buf, i);
+ buf_pool.buddy_stat[i].used++;
+
+ buf_buddy_free_low(buf, i);
+ }
+
+ buf = next;
+ }
+ }
+}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..8ef18ee0
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4180 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "assume_aligned.h"
+#include "mtr0types.h"
+#include "mach0data.h"
+#include "buf0checksum.h"
+#include "mariadb_stats.h"
+#include <string.h>
+
+#ifdef UNIV_INNOCHECKSUM
+# include "my_sys.h"
+# include "buf0buf.h"
+#else
+#include "my_cpu.h"
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "buf0rea.h"
+#include "buf0flu.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "dict0stats_bg.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "srv0mon.h"
+#include "log0crypt.h"
+#include "fil0pagecompress.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include "page0zip.h"
+#include "buf0dump.h"
+#include <map>
+#include <sstream>
+#include "log.h"
+
+using st_::span;
+
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+struct set_numa_interleave_t
+{
+ set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_INTERLEAVE";
+ if (set_mempolicy(MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp,
+ numa_mems_allowed->size) != 0) {
+
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_INTERLEAVE: "
+ << strerror(errno);
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+ }
+
+ ~set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_DEFAULT";
+ if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_DEFAULT: "
+ << strerror(errno);
+ }
+ }
+ }
+};
+
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
+#else
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
+#endif /* HAVE_LIBNUMA */
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool.mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool.mutex.
+
+The buf_pool.mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool.mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool.mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool.free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used for read-ahead mechanism
+of pages, and it can also be used when there is a scan of a full
+table which cannot fit in the memory. Putting the pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate block->page.belongs_to_unzip_LRU()
+holds. The blocks in unzip_LRU will be in same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool.flush_list) contains the blocks
+holding persistent file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool.flush_list_mutex.
+
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
+that are not in buf_pool.flush_list and for which no uncompressed
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
+
+The chains of free memory blocks (buf_pool.zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size 1024..innodb_page_size / 2. These
+blocks are inside the memory blocks of size innodb_page_size and type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating
+ROW_FORMAT=COMPRESSED page frames.
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and releases the io_fix
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leafs
+of a B-tree are scanned in an ascending or descending order.
+When a read page is the first time referenced in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
+
+#ifndef UNIV_INNOCHECKSUM
+# ifdef SUX_LOCK_GENERIC
+void page_hash_latch::read_lock_wait()
+{
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ LF_BACKOFF();
+ if (read_trylock())
+ return;
+ }
+ /* Fall back to yielding to other threads. */
+ do
+ std::this_thread::yield();
+ while (!read_trylock());
+}
+
+void page_hash_latch::write_lock_wait()
+{
+ write_lock_wait_start();
+
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ if (write_lock_poll())
+ return;
+ LF_BACKOFF();
+ }
+
+ /* Fall back to yielding to other threads. */
+ do
+ std::this_thread::yield();
+ while (!write_lock_poll());
+}
+# endif
+
+/** Number of attempts made to read in a page in the buffer pool */
+constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100;
+/** The maximum portion of the buffer pool that can be used for the
+read-ahead buffer. (Divide buf_pool size by this amount) */
+constexpr uint32_t BUF_READ_AHEAD_PORTION= 32;
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+const byte *field_ref_zero;
+
+/** The InnoDB buffer pool */
+buf_pool_t buf_pool;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref;
+
+#ifdef UNIV_DEBUG
+/** This is used to insert validation operations in execution
+in the debug version */
+static Atomic_counter<size_t> buf_dbg_counter;
+#endif /* UNIV_DEBUG */
+
+/** Macro to determine whether the read of write counter is used depending
+on the io_type */
+#define MONITOR_RW_COUNTER(read, counter) \
+ (read ? (counter##_READ) : (counter##_WRITTEN))
+
+/** Decrypt a page for temporary tablespace.
+@param[in,out] tmp_frame Temporary buffer
+@param[in] src_frame Page to decrypt
+@return true if temporary tablespace decrypted, false if not */
+static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
+{
+ if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
+ return true;
+ }
+
+ /* read space & lsn */
+ uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint srclen = uint(srv_page_size)
+ - (header_len + FIL_PAGE_FCRC32_CHECKSUM);
+ ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+
+ if (!log_tmp_block_decrypt(src, srclen, dst,
+ (offset * srv_page_size))) {
+ return false;
+ }
+
+ static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+ memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ FIL_PAGE_FCRC32_CHECKSUM);
+
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(src_frame, tmp_frame,
+ srv_page_size);
+ srv_stats.pages_decrypted.inc();
+ srv_stats.n_temp_blocks_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
+
+/** Decrypt a page.
+@param[in,out] bpage Page control block
+@param[in] node data file
+@return whether the operation was successful */
+static bool buf_page_decrypt_after_read(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+ ut_ad(node.space->id == bpage->id().space());
+ const auto flags = node.space->flags;
+
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
+ bool page_compressed = node.space->is_compressed()
+ && buf_page_is_compressed(dst_frame, flags);
+ const page_id_t id(bpage->id());
+
+ if (id.page_no() == 0) {
+ /* File header pages are not encrypted/compressed */
+ return (true);
+ }
+
+ buf_tmp_buffer_t* slot;
+
+ if (id.space() == SRV_TMP_SPACE_ID
+ && innodb_encrypt_temporary_tables) {
+ slot = buf_pool.io_buf_reserve();
+ slot->allocate();
+ bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame);
+ slot->release();
+ return ok;
+ }
+
+ /* Page is encrypted if encryption information is found from
+ tablespace and page contains used key_version. This is true
+ also for pages first compressed and then encrypted. */
+
+ uint key_version = buf_page_get_key_version(dst_frame, flags);
+
+ if (page_compressed && !key_version) {
+ /* the page we read is unencrypted */
+ /* Find free slot from temporary memory array */
+decompress:
+ if (fil_space_t::full_crc32(flags)
+ && buf_page_is_corrupted(true, dst_frame, flags)) {
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ slot->allocate();
+
+decompress_with_slot:
+ ulint write_size = fil_page_decompress(
+ slot->crypt_buf, dst_frame, flags);
+ slot->release();
+ ut_ad(node.space->referenced());
+ return write_size != 0;
+ }
+
+ if (key_version && node.space->crypt_data) {
+ /* Verify encryption checksum before we even try to
+ decrypt. */
+ if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
+decrypt_failed:
+ ib::error() << "Encrypted page " << id
+ << " in file " << node.name
+ << " looks corrupted; key_version="
+ << key_version;
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ slot->allocate();
+
+ /* decrypt using crypt_buf to dst_frame */
+ if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
+ slot->release();
+ goto decrypt_failed;
+ }
+
+ if ((fil_space_t::full_crc32(flags) && page_compressed)
+ || fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress_with_slot;
+ }
+
+ slot->release();
+ } else if (fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress;
+ }
+
+ ut_ad(node.space->referenced());
+ return true;
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Checks if the page is in crc32 checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in crc32 checksum format. */
+static
+bool
+buf_page_is_checksum_valid_crc32(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+ extern FILE* log_file;
+ extern uint32_t cur_page_num;
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " crc32 calculated = " UINT32PF ";"
+ " recorded checksum field1 = " ULINTPF " recorded"
+ " checksum field2 =" ULINTPF "\n", cur_page_num,
+ crc32, checksum_field1, checksum_field2);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (checksum_field1 != checksum_field2) {
+ return false;
+ }
+
+ return checksum_field1 == crc32;
+}
+
+/** Checks whether the lsn present in the page is lesser than the
+peek current lsn.
+@param[in] check_lsn lsn to check
+@param[in] read_buf page. */
+static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
+{
+#ifndef UNIV_INNOCHECKSUM
+ if (check_lsn && recv_lsn_checks_on) {
+ const lsn_t current_lsn = log_sys.get_lsn();
+ const lsn_t page_lsn
+ = mach_read_from_8(read_buf + FIL_PAGE_LSN);
+
+ /* Since we are going to reset the page LSN during the import
+ phase it makes no sense to spam the log with error messages. */
+ if (current_lsn < page_lsn) {
+
+ const uint32_t space_id = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_ID);
+ const uint32_t page_no = mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET);
+
+ ib::error() << "Page " << page_id_t(space_id, page_no)
+ << " log sequence number " << page_lsn
+ << " is in the future! Current system"
+ << " log sequence number "
+ << current_lsn << ".";
+
+ ib::error() << "Your database may be corrupt or"
+ " you may have copied the InnoDB"
+ " tablespace but not the InnoDB"
+ " log files. "
+ << FORCE_RECOVERY_MSG;
+
+ }
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
+
+
+/** Check if a buffer is all zeroes.
+@param[in] buf data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(span<const byte> buf)
+{
+ ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX);
+ return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
+}
+
+/** Check if a page is corrupt.
+@param check_lsn whether FIL_PAGE_LSN should be checked
+@param read_buf database page
+@param fsp_flags contents of FIL_SPACE_FLAGS
+@return whether the page is corrupted */
+bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
+ uint32_t fsp_flags)
+{
+ if (fil_space_t::full_crc32(fsp_flags)) {
+ bool compressed = false, corrupted = false;
+ const uint size = buf_page_full_crc32_size(
+ read_buf, &compressed, &corrupted);
+ if (corrupted) {
+ return true;
+ }
+ const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM);
+ uint crc32 = mach_read_from_4(end);
+
+ if (!crc32 && size == srv_page_size
+ && buf_is_zeroes(span<const byte>(read_buf, size))) {
+ return false;
+ }
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 3) {
+ crc32++;
+ }
+ });
+
+ if (crc32 != my_crc32c(0, read_buf,
+ size - FIL_PAGE_FCRC32_CHECKSUM)) {
+ return true;
+ }
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ if (!compressed
+ && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
+ + read_buf)
+ && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
+ end - (FIL_PAGE_FCRC32_END_LSN
+ - FIL_PAGE_FCRC32_CHECKSUM),
+ 4)) {
+ return true;
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+ return false;
+ }
+
+ const ulint zip_size = fil_space_t::zip_size(fsp_flags);
+ const uint16_t page_type = fil_page_get_type(read_buf);
+
+ /* We can trust page type if page compression is set on tablespace
+ flags because page compression flag means file must have been
+ created with 10.1 (later than 5.5 code base). In 10.1 page
+ compressed tables do not contain post compression checksum and
+ FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
+ be null if we are in fil_check_first_page() and first page
+ is not compressed or encrypted. Page checksum is verified
+ after decompression (i.e. normally pages are already
+ decompressed at this stage). */
+ if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+#ifndef UNIV_INNOCHECKSUM
+ && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags)
+#endif
+ ) {
+ return(false);
+ }
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");
+
+ if (!zip_size
+ && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+ /* Stored log sequence numbers at the start and the end
+ of page do not match */
+
+ return(true);
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+
+ /* Check whether the checksum fields have correct values */
+
+ if (zip_size) {
+ return !page_zip_verify_checksum(read_buf, zip_size);
+ }
+
+ const uint32_t checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ const uint32_t checksum_field2 = mach_read_from_4(
+ read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+
+ /* A page filled with NUL bytes is considered not corrupted.
+ Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
+ the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero
+ for the first page of each file of the system tablespace.
+ We want to ignore it for the system tablespace, but because
+ we do not know the expected tablespace here, we ignore the
+ field for all data files, except for
+ innodb_checksum_algorithm=full_crc32 which we handled above. */
+ if (!checksum_field1 && !checksum_field2) {
+ /* Checksum fields can have valid value as zero.
+ If the page is not empty then do the checksum
+ calculation for the page. */
+ bool all_zeroes = true;
+ for (size_t i = 0; i < srv_page_size; i++) {
+#ifndef UNIV_INNOCHECKSUM
+ if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) {
+ i += 8;
+ }
+#endif
+ if (read_buf[i]) {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes) {
+ return false;
+ }
+ }
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
+ return !buf_page_is_checksum_valid_crc32(
+ read_buf, checksum_field1, checksum_field2);
+#ifndef UNIV_INNOCHECKSUM
+ default:
+ if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC
+ && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) {
+ return false;
+ }
+
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+
+ /* Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page. */
+
+ /* Since innodb_checksum_algorithm is not strict_* allow
+ any of the algos to match for the old field */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 3) return true;
+ });
+
+ if ((checksum_field1 != crc32
+ || checksum_field2 != crc32)
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+ return true;
+ }
+ }
+
+ switch (checksum_field1) {
+ case 0:
+ case BUF_NO_CHECKSUM_MAGIC:
+ return false;
+ }
+ return (checksum_field1 != crc32 || checksum_field2 != crc32)
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+/** Enable buffers to be dumped to core files
+
+A convience function, not called anyhwere directly however
+it is left available for gdb or any debugger to call
+in the event that you want all of the memory to be dumped
+to a core file.
+
+Returns number of errors found in madvise calls. */
+MY_ATTRIBUTE((used))
+int
+buf_madvise_do_dump()
+{
+ int ret= 0;
+
+ /* mirrors allocation in log_t::create() */
+ if (log_sys.buf) {
+ ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP);
+ ret += madvise(log_sys.flush_buf, log_sys.buf_size,
+ MADV_DODUMP);
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ auto chunk = buf_pool.chunks;
+
+ for (ulint n = buf_pool.n_chunks; n--; chunk++) {
+ ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return ret;
+}
+#endif
+
+#ifndef UNIV_DEBUG
+static inline byte hex_to_ascii(byte hex_digit)
+{
+ const int offset= hex_digit <= 9 ? '0' : 'a' - 10;
+ return byte(hex_digit + offset);
+}
+#endif
+
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+ATTRIBUTE_COLD
+void buf_page_print(const byte *read_buf, ulint zip_size)
+{
+#ifndef UNIV_DEBUG
+ const size_t size = zip_size ? zip_size : srv_page_size;
+ const byte * const end= read_buf + size;
+ sql_print_information("InnoDB: Page dump (%zu bytes):", size);
+
+ do
+ {
+ byte row[64];
+
+ for (byte *r= row; r != &row[64]; r+= 2, read_buf++)
+ {
+ r[0]= hex_to_ascii(byte(*read_buf >> 4));
+ r[1]= hex_to_ascii(*read_buf & 15);
+ }
+
+ sql_print_information("InnoDB: %.*s", 64, row);
+ }
+ while (read_buf != end);
+
+ sql_print_information("InnoDB: End of page dump");
+#endif
+}
+
+/** Initialize a buffer page descriptor.
+@param[in,out] block buffer page descriptor
+@param[in] frame buffer page frame */
+static
+void
+buf_block_init(buf_block_t* block, byte* frame)
+{
+ /* This function should only be executed at database startup or by
+ buf_pool.resize(). Either way, adaptive hash index must not exist. */
+ assert_block_ahi_empty_on_init(block);
+
+ block->page.frame = frame;
+
+ MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock);
+ ut_ad(!block->modify_clock);
+ MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock);
+ block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL));
+#ifdef BTR_CUR_HASH_ADAPT
+ MEM_MAKE_DEFINED(&block->index, sizeof block->index);
+ ut_ad(!block->index);
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->in_unzip_LRU_list = false);
+ ut_d(block->in_withdraw_list = false);
+
+ page_zip_des_init(&block->page.zip);
+
+ MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash);
+ ut_ad(!block->page.hash);
+}
+
+/** Allocate a chunk of buffer frames.
+@param bytes requested size
+@return whether the allocation succeeded */
+inline bool buf_pool_t::chunk_t::create(size_t bytes)
+{
+ DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;);
+ /* Round down to a multiple of page size, although it already should be. */
+ bytes= ut_2pow_round<size_t>(bytes, srv_page_size);
+
+ mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx);
+
+ if (UNIV_UNLIKELY(!mem))
+ return false;
+
+ MEM_UNDEFINED(mem, mem_size());
+
+#ifdef HAVE_LIBNUMA
+ if (srv_numa_interleave)
+ {
+ struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+ if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp, numa_mems_allowed->size,
+ MPOL_MF_MOVE))
+ {
+ ib::warn() << "Failed to set NUMA memory policy of"
+ " buffer pool page frames to MPOL_INTERLEAVE"
+ " (error: " << strerror(errno) << ").";
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+#endif /* HAVE_LIBNUMA */
+
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ blocks= reinterpret_cast<buf_block_t*>(mem);
+
+ /* Align a pointer to the first frame. Note that when
+ opt_large_page_size is smaller than srv_page_size,
+ (with max srv_page_size at 64k don't think any hardware
+ makes this true),
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+ static_assert(sizeof(byte*) == sizeof(ulint), "pointer size");
+
+ byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) +
+ srv_page_size - 1) &
+ ~ulint{srv_page_size - 1});
+ size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem);
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint s= size;
+
+ while (frame < reinterpret_cast<const byte*>(blocks + s))
+ {
+ frame+= srv_page_size;
+ s--;
+ }
+
+ size= s;
+ }
+
+ /* Init block structs and assign frames for them. Then we assign the
+ frames to the first blocks (we already mapped the memory above). */
+
+ buf_block_t *block= blocks;
+
+ for (auto i= size; i--; ) {
+ buf_block_init(block, frame);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
+ /* Add the block to the free list */
+ UT_LIST_ADD_LAST(buf_pool.free, &block->page);
+
+ ut_d(block->page.in_free_list = TRUE);
+ block++;
+ frame+= srv_page_size;
+ }
+
+ reg();
+
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
+{
+ buf_block_t *block= blocks;
+ for (auto i= size; i--; block++)
+ {
+ if (block->page.in_file())
+ {
+ /* The uncompressed buffer pool should never
+ contain ROW_FORMAT=COMPRESSED block descriptors. */
+ ut_ad(block->page.frame);
+ const lsn_t lsn= block->page.oldest_modification();
+
+ if (srv_read_only_mode)
+ {
+ /* The page cleaner is disabled in read-only mode. No pages
+ can be dirtied, so all of them must be clean. */
+ ut_ad(lsn == 0 || lsn == recv_sys.lsn ||
+ srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+ break;
+ }
+
+ if (fsp_is_system_temporary(block->page.id().space()))
+ {
+ ut_ad(lsn == 0 || lsn == 2);
+ break;
+ }
+
+ if (lsn > 1 || !block->page.can_relocate())
+ return block;
+
+ break;
+ }
+ }
+
+ return nullptr;
+}
+#endif /* UNIV_DEBUG */
+
+/** Create the hash table.
+@param n the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
+{
+ n_cells= ut_find_prime(n);
+ const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array,
+ CPU_LEVEL1_DCACHE_LINESIZE);
+ void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size);
+ array= static_cast<hash_chain*>(v);
+}
+
+/** Create the buffer pool.
+@return whether the creation failed */
+bool buf_pool_t::create()
+{
+ ut_ad(this == &buf_pool);
+ ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0);
+ ut_ad(!is_initialised());
+ ut_ad(srv_buf_pool_size > 0);
+ ut_ad(!resizing);
+ ut_ad(!chunks_old);
+ /* mariabackup loads tablespaces, and it requires field_ref_zero to be
+ allocated before innodb initialization */
+ ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero);
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ if (!field_ref_zero) {
+ if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096))
+ field_ref_zero= static_cast<const byte*>
+ (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX));
+ else
+ return true;
+ }
+
+ chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
+
+ new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
+
+ n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit;
+ const size_t chunk_size= srv_buf_pool_chunk_unit;
+
+ chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks));
+ UT_LIST_INIT(free, &buf_page_t::list);
+ curr_size= 0;
+ auto chunk= chunks;
+
+ do
+ {
+ if (!chunk->create(chunk_size))
+ {
+ while (--chunk >= chunks)
+ {
+ buf_block_t* block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ block->page.lock.free();
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+ ut_free(chunks);
+ chunks= nullptr;
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+ ut_ad(!is_initialised());
+ return true;
+ }
+
+ curr_size+= chunk->size;
+ }
+ while (++chunk < chunks + n_chunks);
+
+ ut_ad(is_initialised());
+#if defined(__aarch64__)
+ mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+#else
+ mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr);
+#endif
+
+ UT_LIST_INIT(LRU, &buf_page_t::LRU);
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target= 0;
+ UT_LIST_INIT(flush_list, &buf_page_t::list);
+ UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
+
+ for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
+ UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
+ ulint s= curr_size;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
+ curr_pool_size= srv_buf_pool_size;
+
+ n_chunks_new= n_chunks;
+
+ page_hash.create(2 * curr_size);
+ zip_hash.create(2 * curr_size);
+ last_printout_time= time(NULL);
+
+ mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+ MY_MUTEX_INIT_FAST);
+
+ pthread_cond_init(&done_flush_LRU, nullptr);
+ pthread_cond_init(&done_flush_list, nullptr);
+ pthread_cond_init(&do_flush_list, nullptr);
+ pthread_cond_init(&done_free, nullptr);
+
+ try_LRU_scan= true;
+
+ ut_d(flush_hp.m_mutex= &flush_list_mutex;);
+ ut_d(lru_hp.m_mutex= &mutex);
+ ut_d(lru_scan_itr.m_mutex= &mutex);
+
+ io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
+ OS_AIO_N_PENDING_IOS_PER_THREAD);
+
+ /* FIXME: remove some of these variables */
+ srv_buf_pool_curr_size= curr_pool_size;
+ srv_buf_pool_old_size= srv_buf_pool_size;
+ srv_buf_pool_base_size= srv_buf_pool_size;
+
+ last_activity_count= srv_get_activity_count();
+
+ chunk_t::map_ref= chunk_t::map_reg;
+ buf_LRU_old_ratio_update(100 * 3 / 8, false);
+ btr_search_sys_create();
+ ut_ad(is_initialised());
+ return false;
+}
+
+/** Clean up after successful create() */
+void buf_pool_t::close()
+{
+ ut_ad(this == &buf_pool);
+ if (!is_initialised())
+ return;
+
+ mysql_mutex_destroy(&mutex);
+ mysql_mutex_destroy(&flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
+ bpage= prev_bpage)
+ {
+ prev_bpage= UT_LIST_GET_PREV(LRU, bpage);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->in_LRU_list);
+ /* The buffer pool must be clean during normal shutdown.
+ Only on aborted startup (with recovery) or with innodb_fast_shutdown=2
+ we may discard changes. */
+ ut_d(const lsn_t oldest= bpage->oldest_modification();)
+ ut_ad(fsp_is_system_temporary(bpage->id().space())
+ ? (oldest == 0 || oldest == 2)
+ : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
+
+ if (UNIV_UNLIKELY(!bpage->frame))
+ {
+ bpage->lock.free();
+ ut_free(bpage);
+ }
+ }
+
+ for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
+ {
+ buf_block_t *block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ block->page.lock.free();
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+
+ pthread_cond_destroy(&done_flush_LRU);
+ pthread_cond_destroy(&done_flush_list);
+ pthread_cond_destroy(&do_flush_list);
+ pthread_cond_destroy(&done_free);
+
+ ut_free(chunks);
+ chunks= nullptr;
+ page_hash.free();
+ zip_hash.free();
+
+ io_buf.close();
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= chunk_t::map_ref= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+}
+
+/** Try to reallocate a control block.
+@param block control block to reallocate
+@return whether the reallocation succeeded */
+inline bool buf_pool_t::realloc(buf_block_t *block)
+{
+ buf_block_t* new_block;
+
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.frame);
+
+ new_block = buf_LRU_get_free_only();
+
+ if (new_block == NULL) {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ page_cleaner_wakeup();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ return(false); /* free list was not enough */
+ }
+
+ const page_id_t id{block->page.id()};
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard
+ here, because copying innodb_page_size (4096 to 65536) bytes
+ as well as other changes would likely make the memory
+ transaction too large. */
+ hash_lock.lock();
+
+ if (block->page.can_relocate()) {
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(
+ new_block->page.frame, block->page.frame,
+ srv_page_size);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto frame = new_block->page.frame;
+ new_block->page.lock.free();
+ new (&new_block->page) buf_page_t(block->page);
+ new_block->page.frame = frame;
+
+ /* relocate LRU list */
+ if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) {
+ UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, &new_block->page);
+ }
+
+ if (LRU_old == &block->page) {
+ LRU_old = &new_block->page;
+ }
+
+ ut_ad(new_block->page.in_LRU_list);
+
+ /* relocate unzip_LRU list */
+ if (block->page.zip.data != NULL) {
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = true);
+
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+ UT_LIST_REMOVE(unzip_LRU, block);
+
+ ut_d(block->in_unzip_LRU_list = false);
+ block->page.zip.data = NULL;
+ page_zip_set_size(&block->page.zip, 0);
+
+ if (prev_block != NULL) {
+ UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
+ } else {
+ UT_LIST_ADD_FIRST(unzip_LRU, new_block);
+ }
+ } else {
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = false);
+ }
+
+ /* relocate page_hash */
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ ut_ad(&block->page == page_hash.get(id, chain));
+ buf_pool.page_hash.replace(chain, &block->page,
+ &new_block->page);
+ buf_block_modify_clock_inc(block);
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(block->page.frame
+ + FIL_PAGE_OFFSET, 0xff, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(block->page.frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
+ block->page.set_state(buf_page_t::REMOVE_HASH);
+ if (!fsp_is_system_temporary(id.space())) {
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ block->page.set_corrupt_id();
+
+ /* set other flags of buf_block_t */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* This code should only be executed by resize(),
+ while the adaptive hash index is disabled. */
+ assert_block_ahi_empty(block);
+ assert_block_ahi_empty_on_init(new_block);
+ ut_ad(!block->index);
+ new_block->index = NULL;
+ new_block->n_hash_helps = 0;
+ new_block->n_fields = 1;
+ new_block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->page.set_state(buf_page_t::MEMORY));
+ /* free block */
+ new_block = block;
+ }
+
+ hash_lock.unlock();
+ buf_LRU_block_free_non_file_page(new_block);
+ return(true); /* free_list was enough */
+}
+
+void buf_pool_t::io_buf_t::create(ulint n_slots)
+{
+ this->n_slots= n_slots;
+ slots= static_cast<buf_tmp_buffer_t*>
+ (ut_malloc_nokey(n_slots * sizeof *slots));
+ memset((void*) slots, 0, n_slots * sizeof *slots);
+}
+
+void buf_pool_t::io_buf_t::close()
+{
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ {
+ aligned_free(s->crypt_buf);
+ aligned_free(s->comp_buf);
+ }
+ ut_free(slots);
+ slots= nullptr;
+ n_slots= 0;
+}
+
+buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
+{
+ for (;;)
+ {
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ if (s->acquire())
+ return s;
+ os_aio_wait_until_no_pending_writes(true);
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ if (s->acquire())
+ return s;
+ os_aio_wait_until_no_pending_reads(true);
+ }
+}
+
+/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3).
+@param[in] fmt format
+@param[in] ... extra parameters according to fmt */
+static
+void
+buf_resize_status(
+ const char* fmt,
+ ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ fmt, ap);
+
+ va_end(ap);
+
+ ib::info() << export_vars.innodb_buffer_pool_resize_status;
+}
+
+/** Withdraw blocks from the buffer pool until meeting withdraw_target.
+@return whether retry is needed */
+inline bool buf_pool_t::withdraw_blocks()
+{
+ buf_block_t* block;
+ ulint loop_count = 0;
+
+ ib::info() << "Start to withdraw the last "
+ << withdraw_target << " blocks.";
+
+ while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+
+ /* try to withdraw from free_list */
+ ulint count1 = 0;
+
+ mysql_mutex_lock(&mutex);
+ buf_buddy_condense_free();
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(free));
+ while (block != NULL
+ && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ ut_ad(block->page.in_free_list);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+
+ buf_block_t* next_block;
+ next_block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_NEXT(
+ list, &block->page));
+
+ if (will_be_withdrawn(block->page)) {
+ /* This should be withdrawn */
+ UT_LIST_REMOVE(free, &block->page);
+ UT_LIST_ADD_LAST(withdraw, &block->page);
+ ut_d(block->in_withdraw_list = true);
+ count1++;
+ }
+
+ block = next_block;
+ }
+
+ /* reserve free_list length */
+ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ buf_flush_LRU(
+ std::max<ulint>(withdraw_target
+ - UT_LIST_GET_LEN(withdraw),
+ srv_LRU_scan_depth),
+ true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait_LRU_batch_end();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+
+ /* relocate blocks/buddies in withdrawn area */
+ ulint count2 = 0;
+
+ buf_pool_mutex_exit_forbid();
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage;
+ bpage; bpage = next_bpage) {
+ ut_ad(bpage->in_file());
+ next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ if (UNIV_LIKELY_NULL(bpage->zip.data)
+ && will_be_withdrawn(bpage->zip.data)
+ && bpage->can_relocate()) {
+ if (!buf_buddy_realloc(
+ bpage->zip.data,
+ page_zip_get_size(&bpage->zip))) {
+ /* failed to allocate block */
+ break;
+ }
+ count2++;
+ if (bpage->frame) {
+ goto realloc_frame;
+ }
+ }
+
+ if (bpage->frame && will_be_withdrawn(*bpage)
+ && bpage->can_relocate()) {
+realloc_frame:
+ if (!realloc(reinterpret_cast<buf_block_t*>(
+ bpage))) {
+ /* failed to allocate block */
+ break;
+ }
+ count2++;
+ }
+ }
+ buf_pool_mutex_exit_allow();
+ mysql_mutex_unlock(&mutex);
+
+ buf_resize_status(
+ "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").",
+ UT_LIST_GET_LEN(withdraw),
+ withdraw_target);
+
+ ib::info() << "Withdrew "
+ << count1 << " blocks from free list."
+ << " Tried to relocate " << count2 << " blocks ("
+ << UT_LIST_GET_LEN(withdraw) << "/"
+ << withdraw_target << ").";
+
+ if (++loop_count >= 10) {
+ /* give up for now.
+ retried after user threads paused. */
+
+ ib::info() << "will retry to withdraw later";
+
+ /* need retry later */
+ return(true);
+ }
+ }
+
+ /* confirm withdrawn enough */
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
+ block = chunk->blocks;
+ for (ulint j = chunk->size; j--; block++) {
+ ut_a(block->page.state() == buf_page_t::NOT_USED);
+ ut_ad(block->in_withdraw_list);
+ }
+ }
+
+ ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw)
+ << " blocks.";
+
+ return(false);
+}
+
+
+
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).lock();
+ if (!n)
+ break;
+ }
+}
+
+
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).unlock();
+ if (!n)
+ break;
+ }
+}
+
+
+namespace
+{
+
+struct find_interesting_trx
+{
+ void operator()(const trx_t &trx)
+ {
+ if (trx.state == TRX_STATE_NOT_STARTED)
+ return;
+ if (trx.mysql_thd == nullptr)
+ return;
+ if (withdraw_started <= trx.start_time_micro)
+ return;
+
+ if (!found)
+ {
+ ib::warn() << "The following trx might hold "
+ "the blocks in buffer pool to "
+ "be withdrawn. Buffer pool "
+ "resizing can complete only "
+ "after all the transactions "
+ "below release the blocks.";
+ found= true;
+ }
+
+ lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
+ }
+
+ bool &found;
+ /** microsecond_interval_timer() */
+ const ulonglong withdraw_started;
+ const my_hrtime_t current_time;
+};
+
+} // namespace
+
+/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+inline void buf_pool_t::resize()
+{
+ ut_ad(this == &buf_pool);
+
+ bool warning = false;
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ ut_ad(!resize_in_progress());
+ ut_ad(srv_buf_pool_chunk_unit > 0);
+
+ ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
+ std::ostringstream str_old_size, str_new_size, str_chunk_size;
+ str_old_size << ib::bytes_iec{srv_buf_pool_old_size};
+ str_new_size << ib::bytes_iec{srv_buf_pool_size};
+ str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit};
+
+ buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).",
+ str_old_size.str().c_str(),
+ str_new_size.str().c_str(),
+ str_chunk_size.str().c_str());
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* disable AHI if needed */
+ buf_resize_status("Disabling adaptive hash index.");
+
+ btr_search_s_lock_all();
+ const bool btr_search_disabled = btr_search_enabled;
+ btr_search_s_unlock_all();
+
+ btr_search_disable();
+
+ if (btr_search_disabled) {
+ ib::info() << "disabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ mysql_mutex_lock(&mutex);
+ ut_ad(n_chunks_new == n_chunks);
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+
+ n_chunks_new = (new_instance_size << srv_page_size_shift)
+ / srv_buf_pool_chunk_unit;
+ curr_size = n_chunks_new * chunks->size;
+ mysql_mutex_unlock(&mutex);
+
+ if (is_shrinking()) {
+ /* set withdraw target */
+ size_t w = 0;
+
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks;
+ chunk != echunk; chunk++)
+ w += chunk->size;
+
+ ut_ad(withdraw_target == 0);
+ withdraw_target = w;
+ }
+
+ buf_resize_status("Withdrawing blocks to be shrunken.");
+
+ ulonglong withdraw_started = microsecond_interval_timer();
+ ulonglong message_interval = 60ULL * 1000 * 1000;
+ ulint retry_interval = 1;
+
+withdraw_retry:
+ /* wait for the number of blocks fit to the new size (if needed)*/
+ bool should_retry_withdraw = is_shrinking()
+ && withdraw_blocks();
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ /* abort to resize for shutdown. */
+ return;
+ }
+
+ /* abort buffer pool load */
+ buf_load_abort();
+
+ const ulonglong current_time = microsecond_interval_timer();
+
+ if (should_retry_withdraw
+ && current_time - withdraw_started >= message_interval) {
+
+ if (message_interval > 900000000) {
+ message_interval = 1800000000;
+ } else {
+ message_interval *= 2;
+ }
+
+ bool found= false;
+ find_interesting_trx f
+ {found, withdraw_started, my_hrtime_coarse()};
+ withdraw_started = current_time;
+
+ /* This is going to exceed the maximum size of a
+ memory transaction. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_sys.trx_list.for_each(f);
+ }
+
+ if (should_retry_withdraw) {
+ ib::info() << "Will retry to withdraw " << retry_interval
+ << " seconds later.";
+ std::this_thread::sleep_for(
+ std::chrono::seconds(retry_interval));
+
+ if (retry_interval > 5) {
+ retry_interval = 10;
+ } else {
+ retry_interval *= 2;
+ }
+
+ goto withdraw_retry;
+ }
+
+ buf_resize_status("Latching entire buffer pool.");
+
+#ifndef DBUG_OFF
+ {
+ bool should_wait = true;
+
+ while (should_wait) {
+ should_wait = false;
+ DBUG_EXECUTE_IF(
+ "ib_buf_pool_resize_wait_before_resize",
+ should_wait = true;
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(10)););
+ }
+ }
+#endif /* !DBUG_OFF */
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ return;
+ }
+
+ /* Indicate critical path */
+ resizing.store(true, std::memory_order_relaxed);
+
+ mysql_mutex_lock(&mutex);
+ page_hash.write_lock_all();
+
+ chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
+
+ /* add/delete chunks */
+
+ buf_resize_status("Resizing buffer pool from "
+ ULINTPF " chunks to " ULINTPF " chunks.",
+ n_chunks, n_chunks_new);
+
+ if (is_shrinking()) {
+ /* delete chunks */
+ chunk_t* chunk = chunks + n_chunks_new;
+ const chunk_t* const echunk = chunks + n_chunks;
+
+ ulint sum_freed = 0;
+
+ while (chunk < echunk) {
+ /* buf_LRU_block_free_non_file_page() invokes
+ MEM_NOACCESS() on any buf_pool.free blocks.
+ We must cancel the effect of that. In
+ MemorySanitizer, MEM_NOACCESS() is no-op, so
+ we must not do anything special for it here. */
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+ MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+ MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
+#endif
+
+ buf_block_t* block = chunk->blocks;
+
+ for (ulint j = chunk->size; j--; block++) {
+ block->page.lock.free();
+ }
+
+ allocator.deallocate_large_dodump(
+ chunk->mem, &chunk->mem_pfx);
+ sum_freed += chunk->size;
+ ++chunk;
+ }
+
+ /* discard withdraw list */
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target = 0;
+
+ ib::info() << n_chunks - n_chunks_new
+ << " Chunks (" << sum_freed
+ << " blocks) were freed.";
+
+ n_chunks = n_chunks_new;
+ }
+
+ {
+ /* reallocate chunks */
+ const size_t new_chunks_size
+ = n_chunks_new * sizeof(chunk_t);
+
+ chunk_t* new_chunks = static_cast<chunk_t*>(
+ ut_zalloc_nokey_nofatal(new_chunks_size));
+
+ DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+ ut_free(new_chunks); new_chunks= nullptr; );
+
+ if (!new_chunks) {
+ ib::error() << "failed to allocate"
+ " the chunk array.";
+ n_chunks_new = n_chunks;
+ warning = true;
+ chunks_old = NULL;
+ goto calc_buf_pool_size;
+ }
+
+ ulint n_chunks_copy = ut_min(n_chunks_new, n_chunks);
+
+ memcpy(new_chunks, chunks,
+ n_chunks_copy * sizeof *new_chunks);
+
+ for (ulint j = 0; j < n_chunks_copy; j++) {
+ new_chunks[j].reg();
+ }
+
+ chunks_old = chunks;
+ chunks = new_chunks;
+ }
+
+ if (n_chunks_new > n_chunks) {
+ /* add chunks */
+ ulint sum_added = 0;
+ ulint n = n_chunks;
+ const size_t unit = srv_buf_pool_chunk_unit;
+
+ for (chunk_t* chunk = chunks + n_chunks,
+ * const echunk = chunks + n_chunks_new;
+ chunk != echunk; chunk++) {
+ if (!chunk->create(unit)) {
+ ib::error() << "failed to allocate"
+ " memory for buffer pool chunk";
+
+ warning = true;
+ n_chunks_new = n_chunks;
+ break;
+ }
+
+ sum_added += chunk->size;
+ ++n;
+ }
+
+ ib::info() << n_chunks_new - n_chunks
+ << " chunks (" << sum_added
+ << " blocks) were added.";
+
+ n_chunks = n;
+ }
+calc_buf_pool_size:
+ /* recalc curr_size */
+ ulint new_size = 0;
+
+ {
+ chunk_t* chunk = chunks;
+ const chunk_t* const echunk = chunk + n_chunks;
+ do {
+ new_size += chunk->size;
+ } while (++chunk != echunk);
+ }
+
+ curr_size = new_size;
+ n_chunks_new = n_chunks;
+
+ if (chunks_old) {
+ ut_free(chunks_old);
+ chunks_old = NULL;
+ }
+
+ chunk_t::map* chunk_map_old = chunk_t::map_ref;
+ chunk_t::map_ref = chunk_t::map_reg;
+
+ /* set size */
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+ ulint s= curr_size;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
+ curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+ srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+ extern ulonglong innobase_buffer_pool_size;
+ innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size);
+
+ const bool new_size_too_diff
+ = srv_buf_pool_base_size > srv_buf_pool_size * 2
+ || srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+ mysql_mutex_unlock(&mutex);
+ page_hash.write_unlock_all();
+
+ UT_DELETE(chunk_map_old);
+
+ resizing.store(false, std::memory_order_relaxed);
+
+ /* Normalize other components, if the new size is too different */
+ if (!warning && new_size_too_diff) {
+ srv_buf_pool_base_size = srv_buf_pool_size;
+
+ buf_resize_status("Resizing other hash tables.");
+
+ srv_lock_table_size = 5
+ * (srv_buf_pool_size >> srv_page_size_shift);
+ lock_sys.resize(srv_lock_table_size);
+ dict_sys.resize();
+
+ ib::info() << "Resized hash tables: lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+ " adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+ " and dictionary.";
+ }
+
+ /* normalize ibuf.max_size */
+ ibuf_max_size_update(srv_change_buffer_max_size);
+
+ if (srv_buf_pool_old_size != srv_buf_pool_size) {
+
+ buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes."
+ ,srv_buf_pool_old_size, srv_buf_pool_size);
+ srv_buf_pool_old_size = srv_buf_pool_size;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* enable AHI if needed */
+ if (btr_search_disabled) {
+ btr_search_enable(true);
+ ib::info() << "Re-enabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (warning)
+ buf_resize_status("Resizing buffer pool failed");
+
+ ut_d(validate());
+
+ return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
+{
+ DBUG_ENTER("buf_resize_callback");
+ ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+ mysql_mutex_lock(&buf_pool.mutex);
+ const auto size= srv_buf_pool_size;
+ const bool work= srv_buf_pool_old_size != size;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (work)
+ buf_pool.resize();
+ else
+ {
+ std::ostringstream sout;
+ sout << "Size did not change: old size = new size = " << size;
+ buf_resize_status(sout.str().c_str());
+ }
+ DBUG_VOID_RETURN;
+}
+
+/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+ nullptr, &single_threaded_group);
+
+void buf_resize_start()
+{
+ srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+void buf_resize_shutdown()
+{
+ buf_resize_task.wait();
+}
+
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
+@param bpage ROW_FORMAT=COMPRESSED only block
+@param dpage destination control block */
+static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
+{
+ const page_id_t id{bpage->id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ ut_ad(!bpage->frame);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
+ ut_ad(bpage == buf_pool.page_hash.get(id, chain));
+ ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+ ut_d(const auto state= bpage->state());
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state <= buf_page_t::READ_FIX);
+ ut_ad(bpage->lock.is_write_locked());
+ const auto frame= dpage->frame;
+
+ dpage->lock.free();
+ new (dpage) buf_page_t(*bpage);
+
+ dpage->frame= frame;
+
+ /* Important that we adjust the hazard pointer before
+ removing bpage from LRU list. */
+ if (buf_page_t *b= buf_pool.LRU_remove(bpage))
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
+ else
+ UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);
+
+ if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
+ {
+ buf_pool.LRU_old= dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
+ !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
+ UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+ }
+ else
+ {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ dpage->set_old(dpage->is_old());
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(CheckInLRUList::validate());
+
+ buf_pool.page_hash.replace(chain, bpage, dpage);
+}
+
+buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+ buf_pool_t::hash_chain &chain)
+{
+ ut_ad(&chain == &page_hash.cell_get(id.fold()));
+ page_hash.lock_get(chain).lock();
+
+ buf_page_t *bpage= page_hash.get(id, chain);
+
+ if (bpage)
+ {
+got_block:
+ bpage->fix();
+ if (watch_is_sentinel(*bpage))
+ bpage= nullptr;
+ page_hash.lock_get(chain).unlock();
+ return bpage;
+ }
+
+ page_hash.lock_get(chain).unlock();
+ /* Allocate a watch[] and then try to insert it into the page_hash. */
+ mysql_mutex_lock(&mutex);
+
+ /* The maximum number of purge tasks should never exceed
+ the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
+ watch when setting another watch. */
+ for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; )
+ {
+ ut_ad(w->access_time == 0);
+ ut_ad(!w->oldest_modification());
+ ut_ad(!w->zip.data);
+ ut_ad(!w->in_zip_hash);
+ static_assert(buf_page_t::NOT_USED == 0, "efficiency");
+ if (ut_d(auto s=) w->state())
+ {
+ /* This watch may be in use for some other page. */
+ ut_ad(s >= buf_page_t::UNFIXED);
+ continue;
+ }
+ /* w is pointing to watch[], which is protected by mutex.
+ Normally, buf_page_t::id for objects that are reachable by
+ page_hash.get(id, chain) are protected by hash_lock. */
+ w->set_state(buf_page_t::UNFIXED + 1);
+ w->id_= id;
+
+ page_hash.lock_get(chain).lock();
+ bpage= page_hash.get(id, chain);
+ if (UNIV_LIKELY_NULL(bpage))
+ {
+ w->set_state(buf_page_t::NOT_USED);
+ mysql_mutex_unlock(&mutex);
+ goto got_block;
+ }
+
+ ut_ad(w->state() == buf_page_t::UNFIXED + 1);
+ buf_pool.page_hash.append(chain, w);
+ mysql_mutex_unlock(&mutex);
+ page_hash.lock_get(chain).unlock();
+ return nullptr;
+ }
+
+ ut_error;
+}
+
+/** Stop watching whether a page has been read in.
+watch_set(id) must have returned nullptr before.
+@param id page identifier
+@param chain unlocked hash table chain */
+TRANSACTIONAL_TARGET
+void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
+{
+ mysql_mutex_assert_not_owner(&mutex);
+ buf_page_t *w;
+ {
+ transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
+ /* The page must exist because watch_set() did fix(). */
+ w= page_hash.get(id, chain);
+ ut_ad(w->in_page_hash);
+ if (!watch_is_sentinel(*w))
+ {
+ no_watch:
+ w->unfix();
+ w= nullptr;
+ }
+ else
+ {
+ const auto state= w->state();
+ ut_ad(~buf_page_t::LRU_MASK & state);
+ ut_ad(state >= buf_page_t::UNFIXED + 1);
+ if (state != buf_page_t::UNFIXED + 1)
+ goto no_watch;
+ }
+ }
+
+ if (!w)
+ return;
+
+ const auto old= w;
+ /* The following is based on buf_pool_t::watch_remove(). */
+ mysql_mutex_lock(&mutex);
+ w= page_hash.get(id, chain);
+
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ auto f= w->unfix();
+ ut_ad(f < buf_page_t::READ_FIX || w != old);
+
+ if (f == buf_page_t::UNFIXED && w == old)
+ {
+ page_hash.remove(chain, w);
+ // Now that w is detached from page_hash, release it to watch[].
+ ut_ad(w->id_ == id);
+ ut_ad(!w->frame);
+ ut_ad(!w->zip.data);
+ w->set_state(buf_page_t::NOT_USED);
+ }
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out] space tablespace
+@param[in] page page number
+@param[in,out] mtr mini-transaction */
+TRANSACTIONAL_TARGET
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+
+ if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ || space->is_compressed()
+#endif
+ )
+ mtr->add_freed_offset(space, page);
+
+ ++buf_pool.stat.n_page_gets;
+ const page_id_t page_id(space->id, page);
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ uint32_t fix;
+ buf_block_t *block;
+ {
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || !block->page.frame)
+ /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
+ return;
+ /* To avoid a deadlock with buf_LRU_free_page() of some other page
+ and buf_page_write_complete() of this page, we must not wait for a
+ page latch while holding a page_hash latch. */
+ fix= block->page.fix();
+ }
+
+ if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED))
+ {
+ block->page.unfix();
+ return;
+ }
+
+ block->page.lock.x_lock();
+ if (block->page.is_ibuf_exist())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size());
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index)
+ btr_search_drop_page_hash_index(block, false);
+#endif /* BTR_CUR_HASH_ADAPT */
+ block->page.set_freed(block->page.state());
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with unfix().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+TRANSACTIONAL_TARGET
+buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
+{
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ++buf_pool.stat.n_page_gets;
+ mariadb_increment_pages_accessed();
+
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ buf_page_t *bpage;
+
+lookup:
+ for (bool discard_attempted= false;;)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (hash_lock.is_locked())
+ xabort();
+ bpage= buf_pool.page_hash.get(page_id, chain);
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ {
+ xend();
+ goto must_read_page;
+ }
+ if (!bpage->zip.data)
+ {
+ /* There is no ROW_FORMAT=COMPRESSED page. */
+ xend();
+ return nullptr;
+ }
+ if (discard_attempted || !bpage->frame)
+ {
+ if (!bpage->lock.s_lock_try())
+ xabort();
+ xend();
+ break;
+ }
+ xend();
+ }
+ else
+#endif
+ {
+ hash_lock.lock_shared();
+ bpage= buf_pool.page_hash.get(page_id, chain);
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ {
+ hash_lock.unlock_shared();
+ goto must_read_page;
+ }
+
+ ut_ad(bpage->in_file());
+ ut_ad(page_id == bpage->id());
+
+ if (!bpage->zip.data)
+ {
+ /* There is no ROW_FORMAT=COMPRESSED page. */
+ hash_lock.unlock_shared();
+ return nullptr;
+ }
+
+ if (discard_attempted || !bpage->frame)
+ {
+ /* Even when we are holding a hash_lock, it should be
+ acceptable to wait for a page S-latch here, because
+ buf_page_t::read_complete() will not wait for buf_pool.mutex,
+ and because S-latch would not conflict with a U-latch
+ that would be protecting buf_page_t::write_complete(). */
+ bpage->lock.s_lock();
+ hash_lock.unlock_shared();
+ break;
+ }
+
+ hash_lock.unlock_shared();
+ }
+
+ discard_attempted= true;
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
+ buf_LRU_free_page(bpage, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+
+ {
+ ut_d(const auto s=) bpage->fix();
+ ut_ad(s >= buf_page_t::UNFIXED);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ }
+
+ bpage->set_accessed();
+ buf_page_make_young_if_needed(bpage);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ return bpage;
+
+must_read_page:
+ switch (dberr_t err= buf_read_page(page_id, zip_size)) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ mariadb_increment_pages_read();
+ goto lookup;
+ default:
+ ib::error() << "Reading compressed page " << page_id
+ << " failed with error: " << err;
+ return nullptr;
+ }
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+#ifdef BTR_CUR_HASH_ADAPT
+ /* No adaptive hash index entries may point to a previously
+ unused (and now freshly allocated) block. */
+ assert_block_ahi_empty_on_init(block);
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+}
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
+ /* The tablespace will not be found if this function is called
+ during IMPORT. */
+ fil_space_t* space= fil_space_t::get(block->page.id().space());
+ const unsigned key_version = mach_read_from_4(
+ frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
+ const bool encrypted = crypt_data
+ && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
+ && (!crypt_data->is_default_encryption()
+ || srv_encrypt_tables);
+
+ ut_ad(block->zip_size());
+ ut_a(block->page.id().space() != 0);
+
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+ ib::error() << "Compressed page checksum mismatch for "
+ << (space ? space->chain.start->name : "")
+ << block->page.id() << ": stored: "
+ << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", crc32: "
+ << page_zip_calc_checksum(frame, size, false)
+ << " adler32: "
+ << page_zip_calc_checksum(frame, size, true);
+ goto err_exit;
+ }
+
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ if (page_zip_decompress(&block->page.zip,
+ block->page.frame, TRUE)) {
+func_exit:
+ if (space) {
+ space->release();
+ }
+ return(TRUE);
+ }
+
+ ib::error() << "Unable to decompress "
+ << (space ? space->chain.start->name : "")
+ << block->page.id();
+ goto err_exit;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->page.frame, frame, block->zip_size());
+ goto func_exit;
+ }
+
+ ib::error() << "Unknown compressed page type "
+ << fil_page_get_type(frame)
+ << " in " << (space ? space->chain.start->name : "")
+ << block->page.id();
+
+err_exit:
+ if (encrypted) {
+ ib::info() << "Row compressed page could be encrypted"
+ " with key_version " << key_version;
+ }
+
+ if (space) {
+ space->release();
+ }
+
+ return(FALSE);
+}
+
+/** Low level function used to get access to a database page.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge to happen
+while reading the page from file
+then it makes sure that it does merging of change buffer changes while
+reading the page from file.
+@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
+buf_block_t*
+buf_page_get_low(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ unsigned access_time;
+ ulint retries = 0;
+
+ ut_ad(!mtr || mtr->is_active());
+ ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_SX_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+
+ if (err) {
+ *err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ default:
+ ut_ad(!allow_ibuf_merge);
+ ut_ad(mode == BUF_PEEK_IF_IN_POOL);
+ break;
+ case BUF_GET_POSSIBLY_FREED:
+ case BUF_GET_IF_IN_POOL:
+ /* The caller may pass a dummy page size,
+ because it does not really matter. */
+ break;
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ ut_ad(!mtr->is_freeing_tree());
+ fil_space_t* s = fil_space_get(page_id.space());
+ ut_ad(s);
+ ut_ad(s->zip_size() == zip_size);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!mtr || !ibuf_inside(mtr)
+ || ibuf_page_low(page_id, zip_size, FALSE, NULL));
+
+ ++buf_pool.stat.n_page_gets;
+ mariadb_increment_pages_accessed();
+
+ auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+ buf_block_t* block = guess;
+ uint32_t state;
+
+ if (block) {
+ transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+ if (buf_pool.is_uncompressed(block)
+ && page_id == block->page.id()) {
+ ut_ad(!block->page.in_zip_hash);
+ state = block->page.state();
+ /* Ignore guesses that point to read-fixed blocks.
+ We can only avoid a race condition by
+ looking up the block via buf_pool.page_hash. */
+ if ((state >= buf_page_t::FREED
+ && state < buf_page_t::READ_FIX)
+ || state >= buf_page_t::WRITE_FIX) {
+ state = block->page.fix();
+ goto got_block;
+ }
+ }
+ }
+
+ guess = nullptr;
+
+ /* A memory transaction would frequently be aborted here. */
+ hash_lock.lock_shared();
+ block = reinterpret_cast<buf_block_t*>(
+ buf_pool.page_hash.get(page_id, chain));
+ if (UNIV_LIKELY(block
+ && !buf_pool.watch_is_sentinel(block->page))) {
+ state = block->page.fix();
+ hash_lock.unlock_shared();
+ goto got_block;
+ }
+ hash_lock.unlock_shared();
+
+ /* Page not in buf_pool: needs to be read from file */
+ switch (mode) {
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ return nullptr;
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ /* Buffer-fixing inside watch_set() will prevent eviction */
+ block = reinterpret_cast<buf_block_t*>
+ (buf_pool.watch_set(page_id, chain));
+
+ if (block) {
+ state = block->page.state();
+ goto got_block_fixed;
+ }
+
+ return nullptr;
+ }
+
+ /* The call path is buf_read_page() ->
+ buf_read_page_low() (fil_space_t::io()) ->
+ buf_page_t::read_complete() ->
+ buf_decrypt_after_read(). Here fil_space_t* is used
+ and we decrypt -> buf_page_check_corrupt() where page
+ checksums are compared. Decryption, decompression as
+ well as error handling takes place at a lower level.
+ Here we only need to know whether the page really is
+ corrupted, or if an encrypted page with a valid
+ checksum cannot be decypted. */
+
+ switch (dberr_t local_err = buf_read_page(page_id, zip_size)) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ mariadb_increment_pages_read();
+ buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr));
+ break;
+ default:
+ if (mode != BUF_GET_POSSIBLY_FREED
+ && retries++ < BUF_PAGE_READ_MAX_RETRIES) {
+ DBUG_EXECUTE_IF("intermittent_read_failure",
+ retries = BUF_PAGE_READ_MAX_RETRIES;);
+ }
+ /* fall through */
+ case DB_PAGE_CORRUPTED:
+ if (err) {
+ *err = local_err;
+ }
+ return nullptr;
+ }
+
+ ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+ goto loop;
+
+got_block:
+ ut_ad(!block->page.in_zip_hash);
+ state++;
+got_block_fixed:
+ ut_ad(state > buf_page_t::FREED);
+
+ if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) {
+ if (mode == BUF_PEEK_IF_IN_POOL) {
+ignore_block:
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL);
+ block->unfix();
+ if (err) {
+ *err = DB_CORRUPTION;
+ }
+ return nullptr;
+ }
+
+ if (UNIV_UNLIKELY(!block->page.frame)) {
+ goto wait_for_unzip;
+ }
+ /* A read-fix is released after block->page.lock
+ in buf_page_t::read_complete() or
+ buf_pool_t::corrupted_evict(), or
+ after buf_zip_decompress() in this function. */
+ block->page.lock.s_lock();
+ state = block->page.state();
+ ut_ad(state < buf_page_t::READ_FIX
+ || state >= buf_page_t::WRITE_FIX);
+ const page_id_t id{block->page.id()};
+ block->page.lock.s_unlock();
+
+ if (UNIV_UNLIKELY(id != page_id)) {
+ ut_ad(id == page_id_t{~0ULL});
+ block->page.unfix();
+ if (++retries < BUF_PAGE_READ_MAX_RETRIES) {
+ goto loop;
+ }
+
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+
+ return nullptr;
+ }
+ } else if (mode != BUF_PEEK_IF_IN_POOL) {
+ } else if (!mtr) {
+ ut_ad(!block->page.oldest_modification());
+ mysql_mutex_lock(&buf_pool.mutex);
+ block->unfix();
+
+free_unfixed_block:
+ if (!buf_LRU_free_page(&block->page, true)) {
+ ut_ad(0);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return nullptr;
+ } else if (UNIV_UNLIKELY(!block->page.frame)) {
+ /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an
+ adaptive hash index. There cannot be an
+ adaptive hash index for a compressed-only page. */
+ goto ignore_block;
+ }
+
+ ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL
+ || block->zip_size() == zip_size);
+
+ if (UNIV_UNLIKELY(!block->page.frame)) {
+ if (!block->page.lock.x_lock_try()) {
+wait_for_unzip:
+ /* The page is being read or written, or
+ another thread is executing buf_zip_decompress()
+ in buf_page_get_low() on it. */
+ block->page.unfix();
+ std::this_thread::sleep_for(
+ std::chrono::microseconds(100));
+ goto loop;
+ }
+
+ buf_block_t *new_block = buf_LRU_get_free_block(false);
+ buf_block_init_low(new_block);
+
+wait_for_unfix:
+ mysql_mutex_lock(&buf_pool.mutex);
+ page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain);
+
+ /* It does not make sense to use
+ transactional_lock_guard here, because buf_relocate()
+ would likely make a memory transaction too large. */
+ hash_lock.lock();
+
+ /* block->page.lock implies !block->page.can_relocate() */
+ ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain));
+
+ /* Wait for any other threads to release their buffer-fix
+ on the compressed-only block descriptor.
+ FIXME: Never fix() before acquiring the lock.
+ Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free()
+ we are violating that principle. */
+ state = block->page.state();
+
+ switch (state) {
+ case buf_page_t::UNFIXED + 1:
+ case buf_page_t::IBUF_EXIST + 1:
+ case buf_page_t::REINIT + 1:
+ break;
+ default:
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ if (state < buf_page_t::UNFIXED + 1) {
+ ut_ad(state > buf_page_t::FREED);
+ block->page.lock.x_unlock();
+ hash_lock.unlock();
+ buf_LRU_block_free_non_file_page(new_block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto ignore_block;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock.unlock();
+ std::this_thread::sleep_for(
+ std::chrono::microseconds(100));
+ goto wait_for_unfix;
+ }
+
+ /* Ensure that another buf_page_get_low() will wait for
+ new_block->page.lock.x_unlock(). */
+ block->page.set_state(buf_page_t::READ_FIX);
+
+ /* Move the compressed page from block->page to new_block,
+ and uncompress it. */
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(&block->page, &new_block->page);
+
+ /* X-latch the block for the duration of the decompression. */
+ new_block->page.lock.x_lock();
+ ut_d(block->page.lock.x_unlock());
+
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(new_block, FALSE);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock.unlock();
+
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+ block->page.lock.free();
+#endif
+ ut_free(reinterpret_cast<buf_page_t*>(block));
+ block = new_block;
+
+ buf_pool.n_pend_unzip++;
+
+ access_time = block->page.is_accessed();
+
+ if (!access_time && !recv_no_ibuf_operations
+ && ibuf_page_exists(block->page.id(), block->zip_size())) {
+ state = buf_page_t::IBUF_EXIST + 1;
+ }
+
+ /* Decompress the page while not holding
+ buf_pool.mutex. */
+ const auto ok = buf_zip_decompress(block, false);
+ --buf_pool.n_pend_unzip;
+ if (!ok) {
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ state = block->page.read_unfix(state);
+ block->page.lock.x_unlock();
+
+ if (!ok) {
+ goto free_unfixed_block;
+ }
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+ if (mode != BUF_GET_IF_IN_POOL
+ && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+ } else if (!ibuf_debug || recv_recovery_is_on()) {
+ } else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+ for (ulint i = 0; i < mtr->get_savepoint(); i++) {
+ if (buf_block_t* b = mtr->block_at_savepoint(i)) {
+ if (b->page.oldest_modification() > 2
+ && b->page.lock.have_any()) {
+ /* We are holding a dirty page latch
+ that would hang buf_flush_sync(). */
+ space->release();
+ goto re_evict_fail;
+ }
+ }
+ }
+
+ /* Try to evict the block from the buffer pool, to use the
+ insert buffer (change buffer) as much as possible. */
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ block->unfix();
+
+ /* Blocks cannot be relocated or enter or exit the
+ buf_pool while we are holding the buf_pool.mutex. */
+ const bool evicted = buf_LRU_free_page(&block->page, true);
+ space->release();
+
+ if (!evicted) {
+ block->fix();
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (evicted) {
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ buf_pool.watch_set(page_id, chain);
+ }
+ return(NULL);
+ }
+
+ buf_flush_sync();
+
+ state = block->page.state();
+
+ if (state == buf_page_t::UNFIXED + 1
+ && !block->page.oldest_modification()) {
+ goto re_evict;
+ }
+
+ /* Failed to evict the page; change it directly */
+ }
+re_evict_fail:
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ goto ignore_block;
+ }
+ ut_ad((~buf_page_t::LRU_MASK) & state);
+ ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(block->page.frame);
+
+ if (state >= buf_page_t::UNFIXED
+ && allow_ibuf_merge
+ && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX
+ && page_is_leaf(block->page.frame)) {
+ block->page.lock.x_lock();
+ ut_ad(block->page.id() == page_id
+ || (state >= buf_page_t::READ_FIX
+ && state < buf_page_t::WRITE_FIX));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ dberr_t e;
+
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+page_id_mismatch:
+ state = block->page.state();
+ e = DB_CORRUPTION;
+ibuf_merge_corrupted:
+ if (err) {
+ *err = e;
+ }
+
+ if (block->page.id().is_corrupted()) {
+ buf_pool.corrupted_evict(&block->page, state);
+ }
+ return nullptr;
+ }
+
+ state = block->page.state();
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ if (state >= buf_page_t::IBUF_EXIST
+ && state < buf_page_t::REINIT) {
+ block->page.clear_ibuf_exist();
+ e = ibuf_merge_or_delete_for_page(block, page_id,
+ block->zip_size());
+ if (UNIV_UNLIKELY(e != DB_SUCCESS)) {
+ goto ibuf_merge_corrupted;
+ }
+ }
+
+ if (rw_latch == RW_X_LATCH) {
+ goto get_latch_valid;
+ } else {
+ block->page.lock.x_unlock();
+ goto get_latch;
+ }
+ } else {
+get_latch:
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ mtr->memo_push(block, MTR_MEMO_BUF_FIX);
+ return block;
+ case RW_S_LATCH:
+ block->page.lock.s_lock();
+ ut_ad(!block->page.is_read_fixed());
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ block->page.lock.s_unlock();
+ block->page.lock.x_lock();
+ goto page_id_mismatch;
+ }
+get_latch_valid:
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
+ break;
+ case RW_SX_LATCH:
+ block->page.lock.u_lock();
+ ut_ad(!block->page.is_io_fixed());
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ block->page.lock.u_x_upgrade();
+ goto page_id_mismatch;
+ }
+ goto get_latch_valid;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ if (block->page.lock.x_lock_upgraded()) {
+ ut_ad(block->page.id() == page_id);
+ block->unfix();
+ mtr->page_lock_upgrade(*block);
+ return block;
+ }
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ goto page_id_mismatch;
+ }
+ goto get_latch_valid;
+ }
+
+ ut_ad(page_id_t(page_get_space_id(block->page.frame),
+ page_get_page_no(block->page.frame))
+ == page_id);
+
+ if (mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL) {
+ return block;
+ }
+
+ const bool not_first_access{block->page.set_accessed()};
+ buf_page_make_young_if_needed(&block->page);
+ if (!not_first_access) {
+ buf_read_ahead_linear(page_id, block->zip_size(),
+ ibuf_inside(mtr));
+ }
+ }
+
+ return block;
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction, or NULL
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ buf_block_t *block= recv_sys.recover(page_id);
+ if (UNIV_LIKELY(!block))
+ return buf_page_get_low(page_id, zip_size, rw_latch,
+ guess, mode, mtr, err, allow_ibuf_merge);
+ else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
+ {
+ corrupted:
+ if (err)
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+ /* Recovery is a special case; we fix() before acquiring lock. */
+ auto s= block->page.fix();
+ ut_ad(s >= buf_page_t::FREED);
+ /* The block may be write-fixed at this point because we are not
+ holding a lock, but it must not be read-fixed. */
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ if (err)
+ *err= DB_SUCCESS;
+ const bool must_merge= allow_ibuf_merge &&
+ ibuf_page_exists(page_id, block->zip_size());
+ if (s < buf_page_t::UNFIXED)
+ {
+ got_freed_page:
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+ mysql_mutex_lock(&buf_pool.mutex);
+ block->page.unfix();
+ buf_LRU_free_page(&block->page, true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto corrupted;
+ }
+ else if (must_merge &&
+ fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(block->page.frame))
+ {
+ block->page.lock.x_lock();
+ s= block->page.state();
+ ut_ad(s > buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ if (s < buf_page_t::UNFIXED)
+ {
+ block->page.lock.x_unlock();
+ goto got_freed_page;
+ }
+ else
+ {
+ if (block->page.is_ibuf_exist())
+ block->page.clear_ibuf_exist();
+ if (dberr_t e=
+ ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()))
+ {
+ if (err)
+ *err= e;
+ buf_pool.corrupted_evict(&block->page, s);
+ return nullptr;
+ }
+ }
+
+ if (rw_latch == RW_X_LATCH)
+ {
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ return block;
+ }
+ block->page.lock.x_unlock();
+ }
+ mtr->page_lock(block, rw_latch);
+ return block;
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+TRANSACTIONAL_TARGET
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+ uint64_t modify_clock, mtr_t *mtr)
+{
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+ if (have_transactional_memory);
+ else if (UNIV_UNLIKELY(!block->page.frame))
+ return false;
+ else
+ {
+ const auto state= block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+ state >= buf_page_t::READ_FIX))
+ return false;
+ }
+
+ bool success;
+ const page_id_t id{block->page.id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ bool have_u_not_x= false;
+
+ {
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame))
+ return false;
+ const auto state= block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+ state >= buf_page_t::READ_FIX))
+ return false;
+
+ if (rw_latch == RW_S_LATCH)
+ success= block->page.lock.s_lock_try();
+ else
+ {
+ have_u_not_x= block->page.lock.have_u_not_x();
+ success= have_u_not_x || block->page.lock.x_lock_try();
+ }
+ }
+
+ if (!success)
+ return false;
+
+ if (have_u_not_x)
+ {
+ block->page.lock.u_x_upgrade();
+ mtr->page_lock_upgrade(*block);
+ ut_ad(id == block->page.id());
+ ut_ad(modify_clock == block->modify_clock);
+ }
+ else
+ {
+ ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed());
+ ut_ad(id == block->page.id());
+ ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr));
+
+ if (modify_clock != block->modify_clock || block->page.is_freed())
+ {
+ if (rw_latch == RW_S_LATCH)
+ block->page.lock.s_unlock();
+ else
+ block->page.lock.x_unlock();
+ return false;
+ }
+
+ block->page.fix();
+ ut_ad(!block->page.is_read_fixed());
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+ }
+
+ ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+ ut_d(const auto state = block->page.state());
+ ut_ad(state > buf_page_t::UNFIXED);
+ ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX);
+ ut_ad(~buf_page_t::LRU_MASK & state);
+ ut_ad(block->page.frame);
+
+ return true;
+}
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
+@param[in] page_id page identifier
+@param[in,out] mtr mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+TRANSACTIONAL_TARGET
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ buf_block_t *block;
+
+ {
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || !block->page.frame || !block->page.lock.s_lock_try())
+ return nullptr;
+ }
+
+ block->page.fix();
+ ut_ad(!block->page.is_read_fixed());
+ mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.id() == page_id);
+
+ ++buf_pool.stat.n_page_gets;
+ mariadb_increment_pages_accessed();
+ return block;
+}
+
+/** Initialize the block.
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param fix initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+ uint32_t fix)
+{
+ ut_ad(!page.in_file());
+ buf_block_init_low(this);
+ page.init(fix, page_id);
+ page.set_os_used();
+ page_zip_set_size(&page.zip, zip_size);
+}
+
+TRANSACTIONAL_TARGET
+static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size,
+ mtr_t *mtr, buf_block_t *free_block)
+{
+ ut_ad(mtr->is_active());
+ ut_ad(page_id.space() != 0 || !zip_size);
+
+ free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
+
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+retry:
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain);
+
+ if (bpage && !buf_pool.watch_is_sentinel(*bpage))
+ {
+#ifdef BTR_CUR_HASH_ADAPT
+ const dict_index_t *drop_hash_entry= nullptr;
+#endif
+ bool ibuf_exist= false;
+
+ if (!mtr->have_x_latch(reinterpret_cast<const buf_block_t&>(*bpage)))
+ {
+ const bool got= bpage->lock.x_lock_try();
+ if (!got)
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ bpage->lock.x_lock();
+ const page_id_t id{bpage->id()};
+ if (UNIV_UNLIKELY(id != page_id))
+ {
+ ut_ad(id.is_corrupted());
+ bpage->lock.x_unlock();
+ goto retry;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+
+ auto state= bpage->fix();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ if (state < buf_page_t::UNFIXED)
+ bpage->set_reinit(buf_page_t::FREED);
+ else
+ {
+ bpage->set_reinit(state & buf_page_t::LRU_MASK);
+ ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
+ }
+
+ if (UNIV_LIKELY(bpage->frame != nullptr))
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+ drop_hash_entry= block->index;
+#endif
+ }
+ else
+ {
+ auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard here,
+ because buf_relocate() would likely make the memory transaction
+ too large. */
+ hash_lock.lock();
+
+ if (state < buf_page_t::UNFIXED)
+ bpage->set_reinit(buf_page_t::FREED);
+ else
+ {
+ bpage->set_reinit(state & buf_page_t::LRU_MASK);
+ ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(bpage, &free_block->page);
+ free_block->page.lock.x_lock();
+ buf_flush_relocate_on_flush_list(bpage, &free_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ buf_unzip_LRU_add_block(free_block, FALSE);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock.unlock();
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+ bpage->lock.x_unlock();
+ bpage->lock.free();
+#endif
+ ut_free(bpage);
+ mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX);
+ bpage= &free_block->page;
+ }
+ }
+ else
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ ut_ad(bpage->frame);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+#endif
+ const auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ bpage->set_reinit(state < buf_page_t::UNFIXED ? buf_page_t::FREED
+ : state & buf_page_t::LRU_MASK);
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (drop_hash_entry)
+ btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage),
+ false);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (ibuf_exist && !recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+ return reinterpret_cast<buf_block_t*>(bpage);
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+ DBUG_PRINT("ib_buf", ("create page %u:%u",
+ page_id.space(), page_id.page_no()));
+
+ bpage= &free_block->page;
+
+ ut_ad(bpage->state() == buf_page_t::MEMORY);
+ bpage->lock.x_lock();
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(bpage, false);
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ bpage->set_state(buf_page_t::REINIT + 1);
+ buf_pool.page_hash.append(chain, bpage);
+ }
+
+ if (UNIV_UNLIKELY(zip_size))
+ {
+ bpage->zip.data= buf_buddy_alloc(zip_size);
+
+ /* To maintain the invariant block->in_unzip_LRU_list ==
+ block->page.belongs_to_unzip_LRU() we have to add this
+ block to unzip_LRU after block->page.zip.data is set. */
+ ut_ad(bpage->belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+ }
+
+ buf_pool.stat.n_pages_created++;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+ bpage->set_accessed();
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+ if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+ !srv_is_undo_tablespace(page_id.space()) &&
+ !recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+ static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+ memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+ /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+ following pages:
+ (1) The first page of the InnoDB system tablespace (page 0:0)
+ (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+ (3) key_version on encrypted pages (not page 0:0) */
+
+ memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ return reinterpret_cast<buf_block_t*>(bpage);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@param[in,out] space space object
+@param[in] offset offset of the tablespace
+ or deferred space id if space
+ object is null
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+ space->free_page(offset, false);
+ return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block);
+}
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id space identfier
+@param zip_size ROW_FORMAT=COMPRESSED page size or 0
+@param mtr mini-transaction
+@param free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size,
+ mtr_t *mtr, buf_block_t *free_block)
+{
+ return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block);
+}
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage buffer page whose read or write was completed
+@param read true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read)
+{
+ monitor_id_t counter;
+
+ const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ level = btr_page_get_level(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX
+ && btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ read, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ read,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ read, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ read, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/** Check if the encrypted page is corrupted for the full crc32 format.
+@param[in] space_id page belongs to space id
+@param[in] d page
+@param[in] is_compressed compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+ bool is_compressed)
+{
+ if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+ return true;
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+ return !is_compressed &&
+ memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+ d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check if page is maybe compressed, encrypted or both when we encounter
+corrupted page. Note that we can't be 100% sure if page is corrupted
+or decrypt/decompress just failed.
+@param[in,out] bpage page
+@param[in] node data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if page has been read and is not corrupted
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match. */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
+ dberr_t err = DB_SUCCESS;
+ uint key_version = buf_page_get_key_version(dst_frame,
+ node.space->flags);
+
+ /* In buf_decrypt_after_read we have either decrypted the page if
+ page post encryption checksum matches and used key_id is found
+ from the encryption plugin. If checksum did not match page was
+ not decrypted and it could be either encrypted and corrupted
+ or corrupted or good page. If we decrypted, there page could
+ still be corrupted if used key does not match. */
+ const bool seems_encrypted = !node.space->full_crc32() && key_version
+ && node.space->crypt_data
+ && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+ ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+ node.space->full_crc32());
+
+ /* If traditional checksums match, we assume that page is
+ not anymore encrypted. */
+ if (node.space->full_crc32()
+ && !buf_is_zeroes(span<const byte>(dst_frame,
+ node.space->physical_size()))
+ && (key_version || node.space->is_compressed()
+ || node.space->purpose == FIL_TYPE_TEMPORARY)) {
+ if (buf_page_full_crc32_is_corrupted(
+ bpage->id().space(), dst_frame,
+ node.space->is_compressed())) {
+ err = DB_PAGE_CORRUPTED;
+ }
+ } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+ err = DB_PAGE_CORRUPTED;
+ }
+
+ if (seems_encrypted && err == DB_PAGE_CORRUPTED
+ && bpage->id().page_no() != 0) {
+ err = DB_DECRYPTION_FAILED;
+
+ ib::error()
+ << "The page " << bpage->id()
+ << " in file '" << node.name
+ << "' cannot be decrypted; key_version="
+ << key_version;
+ }
+
+ return (err);
+}
+
+/** Complete a read of a page.
+@param node data file
+@return whether the operation succeeded
+@retval DB_PAGE_CORRUPTED if the checksum fails
+@retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+@retval DB_FAIL if the page contains the wrong ID */
+dberr_t buf_page_t::read_complete(const fil_node_t &node)
+{
+ const page_id_t expected_id{id()};
+ ut_ad(is_read_fixed());
+ ut_ad(!buf_dblwr.is_inside(id()));
+ ut_ad(id().space() == node.space->id);
+ ut_ad(zip_size() == node.space->zip_size());
+ ut_ad(!!zip.ssize == !!zip.data);
+
+ const byte *read_frame= zip.data ? zip.data : frame;
+ ut_ad(read_frame);
+
+ dberr_t err;
+ if (!buf_page_decrypt_after_read(this, node))
+ {
+ err= DB_DECRYPTION_FAILED;
+ goto database_corrupted;
+ }
+
+ if (belongs_to_unzip_LRU())
+ {
+ buf_pool.n_pend_unzip++;
+ auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false);
+ buf_pool.n_pend_unzip--;
+
+ if (!ok)
+ {
+ ib::info() << "Page " << expected_id << " zip_decompress failure.";
+ err= DB_PAGE_CORRUPTED;
+ goto database_corrupted;
+ }
+ }
+
+ {
+ const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID),
+ mach_read_from_4(read_frame + FIL_PAGE_OFFSET));
+
+ if (read_id == expected_id);
+ else if (read_id == page_id_t(0, 0))
+ {
+ /* This is likely an uninitialized (all-zero) page. */
+ err= DB_FAIL;
+ goto release_page;
+ }
+ else if (!node.space->full_crc32() &&
+ page_id_t(0, read_id.page_no()) == expected_id)
+ /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+ before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+ else if (node.space->full_crc32() &&
+ *reinterpret_cast<const uint32_t*>
+ (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+ node.space->crypt_data &&
+ node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+ {
+ ib::error() << "Cannot decrypt " << expected_id;
+ err= DB_DECRYPTION_FAILED;
+ goto release_page;
+ }
+ else
+ {
+ ib::error() << "Space id and page no stored in the page, read in are "
+ << read_id << ", should be " << expected_id;
+ err= DB_PAGE_CORRUPTED;
+ goto release_page;
+ }
+ }
+
+ err= buf_page_check_corrupt(this, node);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+database_corrupted:
+ if (belongs_to_unzip_LRU())
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size);
+
+ if (err == DB_PAGE_CORRUPTED)
+ {
+ ib::error() << "Database page corruption on disk"
+ " or a failed read of file '"
+ << node.name << "' page " << expected_id
+ << ". You may have to recover from a backup.";
+
+ buf_page_print(read_frame, zip_size());
+
+ node.space->set_corrupted();
+
+ ib::info() << " You can use CHECK TABLE to scan"
+ " your table for corruption. "
+ << FORCE_RECOVERY_MSG;
+ }
+
+ if (!srv_force_recovery)
+ goto release_page;
+ }
+
+ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+ {
+release_page:
+ buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
+ return err;
+ }
+
+ const bool recovery= recv_recovery_is_on();
+
+ if (recovery && !recv_recover_page(node.space, this))
+ return DB_PAGE_CORRUPTED;
+
+ const bool ibuf_may_exist= frame && !recv_no_ibuf_operations &&
+ (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) &&
+ fil_page_get_type(read_frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(read_frame);
+
+ if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+ buf_page_monitor(*this, true);
+ DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
+
+ if (!recovery)
+ {
+ ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist
+ ? READ_FIX - IBUF_EXIST
+ : READ_FIX - UNFIXED);
+ ut_ad(f >= READ_FIX);
+ ut_ad(f < WRITE_FIX);
+ }
+ else if (ibuf_may_exist)
+ set_ibuf_exist();
+
+ lock.x_unlock(true);
+
+ return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all blocks are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+void buf_pool_t::assert_all_freed()
+{
+ mysql_mutex_lock(&mutex);
+ const chunk_t *chunk= chunks;
+ for (auto i= n_chunks; i--; chunk++)
+ if (const buf_block_t* block= chunk->not_freed())
+ ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
+ mysql_mutex_unlock(&mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats()
+{
+ buf_pool.last_printout_time = time(NULL);
+ buf_pool.old_stat = buf_pool.stat;
+}
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ /* It is possible that a write batch that has been posted
+ earlier is still not complete. For buffer pool invalidation to
+ proceed we must ensure there is NO write activity happening. */
+
+ ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+ ut_d(buf_pool.assert_all_freed());
+ ut_d(mysql_mutex_lock(&buf_pool.mutex));
+
+ while (UT_LIST_GET_LEN(buf_pool.LRU)) {
+ buf_LRU_scan_and_free_block();
+ }
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+
+ buf_pool.freed_page_clock = 0;
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+ buf_pool.stat.init();
+
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool. */
+void buf_pool_t::validate()
+{
+ ulint n_lru = 0;
+ ulint n_flushing = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+
+ mysql_mutex_lock(&mutex);
+
+ chunk_t* chunk = chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (auto i = n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+
+ for (auto j = chunk->size; j--; block++) {
+ ut_ad(block->page.frame);
+ switch (const auto f = block->page.state()) {
+ case buf_page_t::NOT_USED:
+ n_free++;
+ break;
+
+ case buf_page_t::MEMORY:
+ case buf_page_t::REMOVE_HASH:
+ /* do nothing */
+ break;
+
+ default:
+ if (f >= buf_page_t::READ_FIX
+ && f < buf_page_t::WRITE_FIX) {
+ /* A read-fixed block is not
+ necessarily in the page_hash yet. */
+ break;
+ }
+ ut_ad(f >= buf_page_t::FREED);
+ const page_id_t id{block->page.id()};
+ ut_ad(page_hash.get(
+ id,
+ page_hash.cell_get(id.fold()))
+ == &block->page);
+ n_lru++;
+ }
+ }
+ }
+
+ /* Check dirty blocks. */
+
+ mysql_mutex_lock(&flush_list_mutex);
+ for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_file());
+ ut_ad(b->oldest_modification());
+ ut_ad(!fsp_is_system_temporary(b->id().space()));
+ n_flushing++;
+
+ if (UNIV_UNLIKELY(!b->frame)) {
+ n_lru++;
+ n_zip++;
+ }
+ const page_id_t id{b->id()};
+ ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b);
+ }
+
+ ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ if (n_chunks_new == n_chunks
+ && n_lru + n_free > curr_size + n_zip) {
+
+ ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
+ << ", pool " << curr_size
+ << " zip " << n_zip << ". Aborting...";
+ }
+
+ ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+ if (n_chunks_new == n_chunks
+ && UT_LIST_GET_LEN(free) != n_free) {
+
+ ib::fatal() << "Free list len "
+ << UT_LIST_GET_LEN(free)
+ << ", free blocks " << n_free << ". Aborting...";
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(buf_LRU_validate());
+ ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+ index_id_t* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ index_id_t id;
+ ulint n_found;
+ chunk_t* chunk;
+ dict_index_t* index;
+
+ size = curr_size;
+
+ index_ids = static_cast<index_id_t*>(
+ ut_malloc_nokey(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+ mysql_mutex_lock(&mutex);
+ mysql_mutex_lock(&flush_list_mutex);
+
+ ib::info()
+ << "[buffer pool: size=" << curr_size
+ << ", database pages=" << UT_LIST_GET_LEN(LRU)
+ << ", free pages=" << UT_LIST_GET_LEN(free)
+ << ", modified database pages="
+ << UT_LIST_GET_LEN(flush_list)
+ << ", n pending decompressions=" << n_pend_unzip
+ << ", n pending flush LRU=" << n_flush()
+ << " list=" << os_aio_pending_writes()
+ << ", pages made young=" << stat.n_pages_made_young
+ << ", not young=" << stat.n_pages_not_made_young
+ << ", pages read=" << stat.n_pages_read
+ << ", created=" << stat.n_pages_created
+ << ", written=" << stat.n_pages_written << "]";
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = chunks;
+
+ for (i = n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->page.frame;
+
+ if (fil_page_index_page_check(frame)) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (index_ids[j] == id) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ if (!index) {
+ ib::info() << "Block count for index "
+ << index_ids[i] << " in buffer is about "
+ << counts[i];
+ } else {
+ ib::info() << "Block count for index " << index_ids[i]
+ << " in buffer is about " << counts[i]
+ << ", index " << index->name
+ << " of table " << index->table->name;
+ }
+ }
+
+ ut_free(index_ids);
+ ut_free(counts);
+
+ validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
+{
+ ulint fixed_pages_number= 0;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+ b= UT_LIST_GET_NEXT(LRU, b))
+ if (b->state() > buf_page_t::UNFIXED)
+ fixed_pages_number++;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ return fixed_pages_number;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata.
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+ time_t current_time;
+ double time_elapsed;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ pool_info->pool_size = buf_pool.curr_size;
+
+ pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+ pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+ pool_info->n_pend_reads = os_aio_pending_reads_approx();
+
+ pool_info->n_pending_flush_lru = buf_pool.n_flush();
+
+ pool_info->n_pending_flush_list = os_aio_pending_writes();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool.last_printout_time);
+
+ pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+ pool_info->n_pages_not_made_young =
+ buf_pool.stat.n_pages_not_made_young;
+
+ pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+ pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+ pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+ pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+ pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+ pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+ pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+
+ pool_info->page_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young)
+ / time_elapsed;
+
+ pool_info->page_not_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young)
+ / time_elapsed;
+
+ pool_info->pages_read_rate =
+ static_cast<double>(buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_created_rate =
+ static_cast<double>(buf_pool.stat.n_pages_created
+ - buf_pool.old_stat.n_pages_created)
+ / time_elapsed;
+
+ pool_info->pages_written_rate =
+ static_cast<double>(buf_pool.stat.n_pages_written
+ - buf_pool.old_stat.n_pages_written)
+ / time_elapsed;
+
+ pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+ - buf_pool.old_stat.n_page_gets;
+
+ if (pool_info->n_page_get_delta) {
+ pool_info->page_read_delta = buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read;
+
+ pool_info->young_making_delta =
+ buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young;
+
+ pool_info->not_young_making_delta =
+ buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young;
+ }
+ pool_info->pages_readahead_rnd_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+ - buf_pool.old_stat.n_ra_pages_read_rnd)
+ / time_elapsed;
+
+
+ pool_info->pages_readahead_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read
+ - buf_pool.old_stat.n_ra_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_evicted_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+ - buf_pool.old_stat.n_ra_pages_evicted)
+ / time_elapsed;
+
+ pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+ pool_info->io_sum = buf_LRU_stat_sum.io;
+
+ pool_info->io_cur = buf_LRU_stat_cur.io;
+
+ pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+ pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+ buf_pool_info_t*pool_info, /*!< in: buffer pool info */
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ut_ad(pool_info);
+
+ fprintf(file,
+ "Buffer pool size " ULINTPF "\n"
+ "Free buffers " ULINTPF "\n"
+ "Database pages " ULINTPF "\n"
+ "Old database pages " ULINTPF "\n"
+ "Modified db pages " ULINTPF "\n"
+ "Percent of dirty pages(LRU & free pages): %.3f\n"
+ "Max dirty pages percent: %.3f\n"
+ "Pending reads " ULINTPF "\n"
+ "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+ pool_info->pool_size,
+ pool_info->free_list_len,
+ pool_info->lru_len,
+ pool_info->old_lru_len,
+ pool_info->flush_list_len,
+ static_cast<double>(pool_info->flush_list_len)
+ / (static_cast<double>(pool_info->lru_len
+ + pool_info->free_list_len) + 1.0)
+ * 100.0,
+ srv_max_buf_pool_modified_pct,
+ pool_info->n_pend_reads,
+ pool_info->n_pending_flush_lru,
+ pool_info->n_pending_flush_list);
+
+ fprintf(file,
+ "Pages made young " ULINTPF ", not young " ULINTPF "\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read " ULINTPF ", created " ULINTPF
+ ", written " ULINTPF "\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ pool_info->n_pages_made_young,
+ pool_info->n_pages_not_made_young,
+ pool_info->page_made_young_rate,
+ pool_info->page_not_made_young_rate,
+ pool_info->n_pages_read,
+ pool_info->n_pages_created,
+ pool_info->n_pages_written,
+ pool_info->pages_read_rate,
+ pool_info->pages_created_rate,
+ pool_info->pages_written_rate);
+
+ if (pool_info->n_page_get_delta) {
+ double hit_rate = static_cast<double>(
+ pool_info->page_read_delta)
+ / static_cast<double>(pool_info->n_page_get_delta);
+
+ if (hit_rate > 1) {
+ hit_rate = 1;
+ }
+
+ fprintf(file,
+ "Buffer pool hit rate " ULINTPF " / 1000,"
+ " young-making rate " ULINTPF " / 1000 not "
+ ULINTPF " / 1000\n",
+ ulint(1000 * (1 - hit_rate)),
+ ulint(1000
+ * double(pool_info->young_making_delta)
+ / double(pool_info->n_page_get_delta)),
+ ulint(1000 * double(pool_info->not_young_making_delta)
+ / double(pool_info->n_page_get_delta)));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
+
+ pool_info->pages_readahead_rate,
+ pool_info->pages_evicted_rate,
+ pool_info->pages_readahead_rnd_rate);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+ "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+ "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+ pool_info->lru_len, pool_info->unzip_lru_len,
+ pool_info->io_sum, pool_info->io_cur,
+ pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ buf_pool_info_t pool_info;
+
+ buf_stats_get_pool_info(&pool_info);
+ buf_print_io_instance(&pool_info, file);
+}
+
+/** Verify that post encryption checksum match with the calculated checksum.
+This function should be called only if tablespace contains crypt data metadata.
+@param page page frame
+@param fsp_flags contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags)
+{
+ if (!fil_space_t::full_crc32(fsp_flags)) {
+ return fil_space_verify_crypt_checksum(
+ page, fil_space_t::zip_size(fsp_flags));
+ }
+
+ return !buf_page_is_corrupted(true, page, fsp_flags);
+}
+
+/** Print the given page_id_t object.
+@param[in,out] out the output stream
+@param[in] page_id the page_id_t object to be printed
+@return the output stream */
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
+{
+ out << "[page id: space=" << page_id.space()
+ << ", page number=" << page_id.page_no() << "]";
+ return out;
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
new file mode 100644
index 00000000..662343ae
--- /dev/null
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "buf0checksum.h"
+#include "fil0fil.h"
+#include "ut0rnd.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "srv0srv.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in] page buffer page (srv_page_size bytes)
+@return CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page)
+{
+ /* Note: innodb_checksum_algorithm=crc32 could and should have
+ included the entire page in the checksum, and CRC-32 values
+ should be combined with the CRC-32 function, not with
+ exclusive OR. We stick to the current algorithm in order to
+ remain compatible with old data files. */
+ return my_crc32c(0, page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
+ ^ my_crc32c(0, page + FIL_PAGE_DATA,
+ srv_page_size
+ - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page)
+{
+ ulint checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of page because
+ there we store the old formula checksum. */
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ srv_page_size - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ return(static_cast<uint32_t>(checksum));
+}
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page)
+{
+ return(static_cast<uint32_t>
+ (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..e9aea355
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,779 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublwrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+ return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
+}
+
+void buf_dblwr_t::init()
+{
+ if (!active_slot)
+ {
+ active_slot= &slots[0];
+ mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+ }
+}
+
+/** Initialise the persistent storage of the doublewrite buffer.
+@param header doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+ ut_ad(!active_slot->first_free);
+ ut_ad(!active_slot->reserved);
+ ut_ad(!batch_running);
+
+ block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+ block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+ const uint32_t buf_size= 2 * block_size();
+ for (int i= 0; i < 2; i++)
+ {
+ slots[i].write_buf= static_cast<byte*>
+ (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+ slots[i].buf_block_arr= static_cast<element*>
+ (ut_zalloc_nokey(buf_size * sizeof(element)));
+ }
+ active_slot= &slots[0];
+}
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+ if (is_created())
+ return true;
+
+ mtr_t mtr;
+ const ulint size= block_size();
+
+start_again:
+ mtr.start();
+
+ dberr_t err;
+ buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+ if (!trx_sys_block)
+ {
+ mtr.commit();
+ return false;
+ }
+
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->page.frame) ==
+ TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* The doublewrite buffer has already been created: just read in
+ some numbers */
+ init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame);
+ mtr.commit();
+ return true;
+ }
+
+ if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+ {
+ ib::error() << "Cannot create doublewrite buffer: "
+ "the first file in innodb_data_file_path must be at least "
+ << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+fail:
+ mtr.commit();
+ return false;
+ }
+ else
+ {
+ buf_block_t *b= fseg_create(fil_system.sys_space,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+ &mtr, &err, false, trx_sys_block);
+ if (!b)
+ {
+ ib::error() << "Cannot create doublewrite buffer: " << err;
+ goto fail;
+ }
+
+ ib::info() << "Doublewrite buffer not found: creating new";
+
+ /* FIXME: After this point, the doublewrite buffer creation
+ is not atomic. The doublewrite buffer should not exist in
+ the InnoDB system tablespace file in the first place.
+ It could be located in separate optional file(s) in a
+ user-specified location. */
+ }
+
+ byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->page.frame;
+ for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+ i < 2 * size + extent_size / 2; i++)
+ {
+ buf_block_t *new_block=
+ fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP,
+ false, &mtr, &mtr, &err);
+ if (!new_block)
+ {
+ ib::error() << "Cannot create doublewrite buffer: "
+ " you must increase your tablespace size."
+ " Cannot continue operation.";
+ /* This may essentially corrupt the doublewrite
+ buffer. However, usually the doublewrite buffer
+ is created at database initialization, and it
+ should not matter (just remove all newly created
+ InnoDB files and restart). */
+ mtr.commit();
+ return false;
+ }
+
+ /* We read the allocated pages to the buffer pool; when they are
+ written to disk in a flush, the space id and page number fields
+ are also written to the pages. When we at database startup read
+ pages from the doublewrite buffer, we know that if the space id
+ and page number in them are the same as the page position in the
+ tablespace, then the page has not been written to in
+ doublewrite. */
+
+ ut_ad(new_block->page.lock.not_recursive());
+ const page_id_t id= new_block->page.id();
+ /* We only do this in the debug build, to ensure that the check in
+ buf_flush_init_for_writing() will see a valid page type. The
+ flushes of new_block are actually unnecessary here. */
+ ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame,
+ FIL_PAGE_TYPE_SYS));
+
+ if (i == size / 2)
+ {
+ ut_a(id.page_no() == size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+ trx_sys_block->page.frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame,
+ id.page_no());
+ }
+ else if (i == size / 2 + size)
+ {
+ ut_a(id.page_no() == 2 * size);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+ trx_sys_block->page.frame, id.page_no());
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+ TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame,
+ id.page_no());
+ }
+ else if (i > size / 2)
+ ut_a(id.page_no() == prev_page_no + 1);
+
+ if (((i + 1) & 15) == 0) {
+ /* rw_locks can only be recursively x-locked 2048 times. (on 32
+ bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+ negative number, and thus lock_word becomes like a shared lock).
+ For 4k page size this loop will lock the fseg header too many
+ times. Since this code is not done while any other threads are
+ active, restart the MTR occasionally. */
+ mtr.commit();
+ mtr.start();
+ trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+ fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+ trx_sys_block->page.frame;
+ }
+
+ prev_page_no= id.page_no();
+ }
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+ TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+ mtr.write<4>(*trx_sys_block,
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ trx_sys_block->page.frame,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+ mtr.commit();
+
+ buf_flush_wait_flushed(mtr.commit_lsn());
+
+ /* Remove doublewrite pages from LRU */
+ buf_pool_invalidate();
+ goto start_again;
+}
+
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in a crash recovery, this function
+loads the pages from double write buffer into memory.
+@param file File handle
+@param path Path name of file
+@return DB_SUCCESS or error code */
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
+{
+ ut_ad(this == &buf_dblwr);
+ const uint32_t size= block_size();
+
+ /* We do the file i/o past the buffer pool */
+ byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+ /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+ dberr_t err= os_file_read(IORequestRead, file, read_buf,
+ TRX_SYS_PAGE_NO << srv_page_size_shift,
+ srv_page_size, nullptr);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the system tablespace header page";
+func_exit:
+ aligned_free(read_buf);
+ return err;
+ }
+
+ /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
+ if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+ read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ {
+ /* There is no doublewrite buffer initialized in the TRX_SYS page.
+ This should normally not be possible; the doublewrite buffer should
+ be initialized when creating the database. */
+ err= DB_SUCCESS;
+ goto func_exit;
+ }
+
+ init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+ const bool upgrade_to_innodb_file_per_table=
+ mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+ TRX_SYS_DOUBLEWRITE + read_buf) !=
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+
+ auto write_buf= active_slot->write_buf;
+ /* Read the pages from the doublewrite buffer to memory */
+ err= os_file_read(IORequestRead, file, write_buf,
+ block1.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift, nullptr);
+
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the first double write buffer extent";
+ goto func_exit;
+ }
+
+ err= os_file_read(IORequestRead, file,
+ write_buf + (size << srv_page_size_shift),
+ block2.page_no() << srv_page_size_shift,
+ size << srv_page_size_shift, nullptr);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to read the second double write buffer extent";
+ goto func_exit;
+ }
+
+ byte *page= write_buf;
+
+ if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+ {
+ ib::info() << "Resetting space id's in the doublewrite buffer";
+
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ {
+ memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+ /* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to
+ calculate new checksums for the pages because the field
+ .._SPACE_ID does not affect them. Write the page back to where
+ we read it from. */
+ const ulint source_page_no= i < size
+ ? block1.page_no() + i
+ : block2.page_no() + i - size;
+ err= os_file_write(IORequestWrite, path, file, page,
+ source_page_no << srv_page_size_shift, srv_page_size);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Failed to upgrade the double write buffer";
+ goto func_exit;
+ }
+ }
+ os_file_flush(file);
+ }
+ else
+ for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+ if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+ /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+ recv_sys.dblwr.add(page);
+
+ err= DB_SUCCESS;
+ goto func_exit;
+}
+
+/** Process and remove the double write buffer pages for all tablespaces. */
+void buf_dblwr_t::recover()
+{
+ ut_ad(log_sys.last_checkpoint_lsn);
+ if (!is_created())
+ return;
+
+ uint32_t page_no_dblwr= 0;
+ byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+ srv_page_size));
+ byte *const buf= read_buf + srv_page_size;
+
+ for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+ i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+ {
+ byte *page= *i;
+ const uint32_t page_no= page_get_page_no(page);
+ if (!page_no) /* recovered via recv_dblwr_t::restore_first_page() */
+ continue;
+
+ const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+ if (log_sys.last_checkpoint_lsn > lsn)
+ /* Pages written before the checkpoint are not useful for recovery. */
+ continue;
+ const uint32_t space_id= page_get_space_id(page);
+ const page_id_t page_id(space_id, page_no);
+
+ if (recv_sys.scanned_lsn < lsn)
+ {
+ ib::info() << "Ignoring a doublewrite copy of page " << page_id
+ << " with future log sequence number " << lsn;
+ continue;
+ }
+
+ fil_space_t *space= fil_space_t::get(space_id);
+
+ if (!space)
+ /* The tablespace that this page once belonged to does not exist */
+ continue;
+
+ if (UNIV_UNLIKELY(page_no >= space->get_size()))
+ {
+ /* Do not report the warning for undo tablespaces, because they
+ can be truncated in place. */
+ if (!srv_is_undo_tablespace(space_id))
+ ib::warn() << "A copy of page " << page_no
+ << " in the doublewrite buffer slot " << page_no_dblwr
+ << " is beyond the end of " << space->chain.start->name
+ << " (" << space->size << " pages)";
+next_page:
+ space->release();
+ continue;
+ }
+
+ const ulint physical_size= space->physical_size();
+ ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+ /* We want to ensure that for partial reads the unread portion of
+ the page is NUL. */
+ memset(read_buf, 0x0, physical_size);
+
+ /* Read in the actual page from the file */
+ fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+ os_offset_t{page_no} * physical_size,
+ physical_size, read_buf);
+
+ if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+ {
+ ib::warn() << "Double write buffer recovery: " << page_id
+ << " ('" << space->chain.start->name
+ << "') read failed with error: " << fio.err;
+ continue;
+ }
+
+ if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+ {
+ /* We will check if the copy in the doublewrite buffer is
+ valid. If not, we will ignore this page (there should be redo
+ log records to initialize it). */
+ }
+ else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+ goto next_page;
+ else
+ /* We intentionally skip this message for all-zero pages. */
+ ib::info() << "Trying to recover page " << page_id
+ << " from the doublewrite buffer.";
+
+ page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+ if (!page)
+ goto next_page;
+
+ /* Write the good page from the doublewrite buffer to the intended
+ position. */
+ space->reacquire();
+ fio= space->io(IORequestWrite,
+ os_offset_t{page_id.page_no()} * physical_size,
+ physical_size, page);
+
+ if (fio.err == DB_SUCCESS)
+ ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+ << "' from the doublewrite buffer.";
+ goto next_page;
+ }
+
+ recv_sys.dblwr.pages.clear();
+ fil_flush_file_spaces();
+ aligned_free(read_buf);
+}
+
+/** Free the doublewrite buffer. */
+void buf_dblwr_t::close()
+{
+ if (!active_slot)
+ return;
+
+ ut_ad(!active_slot->reserved);
+ ut_ad(!active_slot->first_free);
+ ut_ad(!batch_running);
+
+ pthread_cond_destroy(&cond);
+ for (int i= 0; i < 2; i++)
+ {
+ aligned_free(slots[i].write_buf);
+ ut_free(slots[i].buf_block_arr);
+ }
+ mysql_mutex_destroy(&mutex);
+
+ memset((void*) this, 0, sizeof *this);
+}
+
+/** Update the doublewrite buffer on write completion. */
+void buf_dblwr_t::write_completed()
+{
+ ut_ad(this == &buf_dblwr);
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&mutex);
+
+ ut_ad(is_created());
+ ut_ad(srv_use_doublewrite_buf);
+ ut_ad(batch_running);
+ slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved);
+ ut_ad(flush_slot->reserved <= flush_slot->first_free);
+
+ if (!--flush_slot->reserved)
+ {
+ mysql_mutex_unlock(&mutex);
+ /* This will finish the batch. Sync data files to the disk. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ flush_slot->first_free= 0;
+ batch_running= false;
+ pthread_cond_broadcast(&cond);
+ }
+
+ mysql_mutex_unlock(&mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Check the LSN values on the page.
+@param[in] page page to check
+@param[in] s tablespace */
+static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
+{
+ /* Ignore page_compressed or encrypted pages */
+ if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
+ return;
+ const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
+ const byte* lsn_end= page + srv_page_size -
+ (s.full_crc32()
+ ? FIL_PAGE_FCRC32_END_LSN
+ : FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
+}
+
+static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
+{
+ if (fil_space_t *space= fil_space_t::get_for_write(b.id().space()))
+ {
+ buf_dblwr_check_page_lsn(page, *space);
+ space->release();
+ }
+}
+
+/** Check the LSN values on the page with which this block is associated. */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
+{
+ ut_ad(bpage->in_file());
+ const page_t *page= bpage->frame;
+ ut_ad(page);
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ if (page_is_comp(page))
+ {
+ if (page_simple_validate_new(page))
+ return;
+ }
+ else if (page_simple_validate_old(page))
+ return;
+ /* While it is possible that this is not an index page but just
+ happens to have wrongly set FIL_PAGE_TYPE, such pages should never
+ be modified to without also adjusting the page type during page
+ allocation or buf_flush_init_for_writing() or
+ fil_block_reset_type(). */
+ buf_page_print(page);
+
+ ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+ << " to be written to data file. We intentionally crash"
+ " the server to prevent corrupt data from ending up in"
+ " data files.";
+ }
+}
+#endif /* UNIV_DEBUG */
+
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
+{
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(size == block_size());
+
+ for (;;)
+ {
+ if (!active_slot->first_free)
+ return false;
+ if (!batch_running)
+ break;
+ my_cond_wait(&cond, &mutex.m_mutex);
+ }
+
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(!flushing_buffered_writes);
+
+ /* Disallow anyone else to start another batch of flushing. */
+ slot *flush_slot= active_slot;
+ /* Switch the active slot */
+ active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_a(active_slot->first_free == 0);
+ batch_running= true;
+ const ulint old_first_free= flush_slot->first_free;
+ auto write_buf= flush_slot->write_buf;
+ const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+ old_first_free > size;
+ flushing_buffered_writes= 1 + multi_batch;
+ /* Now safe to release the mutex. */
+ mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+ for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+ {
+ buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+ if (bpage->zip.data)
+ /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+ continue;
+
+ /* Check that the actual page in the buffer pool is not corrupt
+ and the LSN values are sane. */
+ buf_dblwr_check_block(bpage);
+ ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+ }
+#endif /* UNIV_DEBUG */
+ const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start,
+ IORequest::DBLWR_BATCH};
+ ut_a(fil_system.sys_space->acquire());
+ if (multi_batch)
+ {
+ fil_system.sys_space->reacquire();
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ size << srv_page_size_shift);
+ os_aio(request, write_buf + (size << srv_page_size_shift),
+ os_offset_t{block2.page_no()} << srv_page_size_shift,
+ (old_first_free - size) << srv_page_size_shift);
+ }
+ else
+ os_aio(request, write_buf,
+ os_offset_t{block1.page_no()} << srv_page_size_shift,
+ old_first_free << srv_page_size_shift);
+ return true;
+}
+
+static void *get_frame(const IORequest &request)
+{
+ if (request.slot)
+ return request.slot->out_buf;
+ const buf_page_t *bpage= request.bpage;
+ return bpage->zip.data ? bpage->zip.data : bpage->frame;
+}
+
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
+{
+ ut_ad(this == &buf_dblwr);
+ ut_ad(srv_use_doublewrite_buf);
+ ut_ad(is_created());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!request.bpage);
+ ut_ad(request.node == fil_system.sys_space->chain.start);
+ ut_ad(request.type == IORequest::DBLWR_BATCH);
+ mysql_mutex_lock(&mutex);
+ ut_ad(batch_running);
+ ut_ad(flushing_buffered_writes);
+ ut_ad(flushing_buffered_writes <= 2);
+ writes_completed++;
+ if (UNIV_UNLIKELY(--flushing_buffered_writes))
+ {
+ mysql_mutex_unlock(&mutex);
+ return;
+ }
+
+ slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+ ut_ad(flush_slot->reserved == flush_slot->first_free);
+ /* increment the doublewrite flushed pages counter */
+ pages_written+= flush_slot->first_free;
+ mysql_mutex_unlock(&mutex);
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_system.sys_space->flush<false>();
+
+ /* The writes have been flushed to disk now and in recovery we will
+ find them in the doublewrite buffer blocks. Next, write the data pages. */
+ for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+ {
+ auto e= flush_slot->buf_block_arr[i];
+ buf_page_t* bpage= e.request.bpage;
+ ut_ad(bpage->in_file());
+
+ void *frame= get_frame(e.request);
+ ut_ad(frame);
+
+ auto e_size= e.size;
+
+ if (UNIV_LIKELY_NULL(bpage->zip.data))
+ {
+ e_size= bpage->zip_size();
+ ut_ad(e_size);
+ }
+ else
+ {
+ ut_ad(!bpage->zip_size());
+ ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+ }
+
+ const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN +
+ static_cast<const byte*>(frame)));
+ ut_ad(lsn);
+ ut_ad(lsn >= bpage->oldest_modification());
+ log_write_up_to(lsn, true);
+ e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+ frame, bpage);
+ }
+}
+
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+ if (!is_created() || !srv_use_doublewrite_buf)
+ {
+ fil_flush_file_spaces();
+ return;
+ }
+
+ ut_ad(!srv_read_only_mode);
+ const ulint size= block_size();
+
+ mysql_mutex_lock(&mutex);
+ if (!flush_buffered_writes(size))
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request asynchronous write request
+@param size payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+ ut_ad(request.is_async());
+ ut_ad(request.is_write());
+ ut_ad(request.bpage);
+ ut_ad(request.bpage->in_file());
+ ut_ad(request.node);
+ ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
+ ut_ad(request.node->space->id == request.bpage->id().space());
+ ut_ad(request.node->space->referenced());
+ ut_ad(!srv_read_only_mode);
+
+ const ulint buf_size= 2 * block_size();
+
+ mysql_mutex_lock(&mutex);
+
+ for (;;)
+ {
+ ut_ad(active_slot->first_free <= buf_size);
+ if (active_slot->first_free != buf_size)
+ break;
+
+ if (flush_buffered_writes(buf_size / 2))
+ mysql_mutex_lock(&mutex);
+ }
+
+ byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+ /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+ and at least srv_page_size (4096-byte) for everything else. */
+ memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
+ /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+ memset_aligned<256>(p + size, 0, srv_page_size - size);
+ /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+ are integer multiples of 256, so the above can translate into simple
+ SIMD instructions. Currently, we make no such assumptions about the
+ non-pointer parameters that are passed to the _aligned templates. */
+ ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+ ut_ad(active_slot->reserved == active_slot->first_free);
+ ut_ad(active_slot->reserved < buf_size);
+ new (active_slot->buf_block_arr + active_slot->first_free++)
+ element{request, size};
+ active_slot->reserved= active_slot->first_free;
+
+ if (active_slot->first_free != buf_size ||
+ !flush_buffered_writes(buf_size / 2))
+ mysql_mutex_unlock(&mutex);
+}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..957632db
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,765 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "my_sys.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "buf0rea.h"
+#include "buf0dump.h"
+#include "dict0dict.h"
+#include "os0file.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "ut0byte.h"
+
+#include <algorithm>
+
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include <my_service_manager.h>
+
+static void buf_do_load_dump();
+
+enum status_severity {
+ STATUS_INFO,
+ STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/* Flags that tell the buffer pool dump/load thread which action should it
+take after being waked up. */
+static volatile bool buf_dump_should_start;
+static volatile bool buf_load_should_start;
+
+static bool buf_load_abort_flag;
+
+/** Start the buffer pool dump/load task and instructs it to start a dump. */
+void buf_dump_start()
+{
+ buf_dump_should_start= true;
+ buf_do_load_dump();
+}
+
+/** Start the buffer pool dump/load task and instructs it to start a load. */
+void buf_load_start()
+{
+ buf_load_should_start= true;
+ buf_do_load_dump();
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according
+ to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_dump_status,
+ sizeof(export_vars.innodb_buffer_pool_dump_status),
+ fmt, ap);
+
+ switch (severity) {
+ case STATUS_INFO:
+ ib::info() << export_vars.innodb_buffer_pool_dump_status;
+ break;
+
+ case STATUS_ERR:
+ ib::error() << export_vars.innodb_buffer_pool_dump_status;
+ break;
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_load_status,
+ sizeof(export_vars.innodb_buffer_pool_load_status),
+ fmt, ap);
+
+ switch (severity) {
+ case STATUS_INFO:
+ ib::info() << export_vars.innodb_buffer_pool_load_status;
+ break;
+
+ case STATUS_ERR:
+ ib::error() << export_vars.innodb_buffer_pool_load_status;
+ break;
+ }
+
+ va_end(ap);
+}
+
+/** Returns the directory path where the buffer pool dump file will be created.
+@return directory path */
+static
+const char*
+get_buf_dump_dir()
+{
+ const char* dump_dir;
+
+ /* The dump file should be created in the default data directory if
+ innodb_data_home_dir is set as an empty string. */
+ if (!*srv_data_home) {
+ dump_dir = fil_path_to_mysql_datadir;
+ } else {
+ dump_dir = srv_data_home;
+ }
+
+ return(dump_dir);
+}
+
+/** Generate the path to the buffer pool dump/load file.
+@param[out] path generated path
+@param[in] path_size size of 'path', used as in snprintf(3). */
+static void buf_dump_generate_path(char *path, size_t path_size)
+{
+ char buf[FN_REFLEN];
+
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(),
+ srv_buf_dump_filename);
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+
+ os_file_type_t type;
+ bool exists = false;
+ bool ret;
+
+ ret = os_file_status(buf, &exists, &type);
+
+ /* For realpath() to succeed the file must exist. */
+
+ if (ret && exists) {
+ /* my_realpath() assumes the destination buffer is big enough
+ to hold FN_REFLEN bytes. */
+ ut_a(path_size >= FN_REFLEN);
+
+ my_realpath(path, buf, 0);
+ } else {
+ /* If it does not exist, then resolve only srv_data_home
+ and append srv_buf_dump_filename to it. */
+ char srv_data_home_full[FN_REFLEN];
+
+ my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
+ const char *format;
+
+ switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
+ format = "%s%s";
+ break;
+ default:
+ format = "%s/%s";
+ }
+
+ snprintf(path, path_size, format,
+ srv_data_home_full, srv_buf_dump_filename);
+ }
+}
+
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+ ibool obey_shutdown) /*!< in: quit if we are in a shutting down
+ state */
+{
+#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
+
+ char full_filename[OS_FILE_MAX_PATH];
+ char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
+ char now[32];
+ FILE* f;
+ int ret;
+
+ buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+ snprintf(tmp_filename, sizeof(tmp_filename),
+ "%s.incomplete", full_filename);
+
+ buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
+ full_filename);
+
+#ifdef _WIN32
+ /* use my_fopen() for correct permissions during bootstrap*/
+ f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0);
+#elif defined(__GLIBC__) || O_CLOEXEC == 0
+ f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
+#else
+ {
+ int fd;
+ fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
+ if (fd >= 0) {
+ f = fdopen(fd, "w");
+ }
+ else {
+ f = NULL;
+ }
+ }
+#endif
+ if (f == NULL) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot open '%s' for writing: %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ const buf_page_t* bpage;
+ page_id_t* dump;
+ ulint n_pages;
+ ulint j;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ /* skip empty buffer pools */
+ if (n_pages == 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto done;
+ }
+
+ if (srv_buf_pool_dump_pct != 100) {
+ ulint t_pages;
+
+ /* limit the number of total pages dumped to X% of the
+ total number of pages */
+ t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
+ if (n_pages > t_pages) {
+ buf_dump_status(STATUS_INFO,
+ "Restricted to " ULINTPF
+ " pages due to "
+ "innodb_buf_pool_dump_pct=%lu",
+ t_pages, srv_buf_pool_dump_pct);
+ n_pages = t_pages;
+ }
+
+ if (n_pages == 0) {
+ n_pages = 1;
+ }
+ }
+
+ dump = static_cast<page_id_t*>(ut_malloc_nokey(
+ n_pages * sizeof(*dump)));
+
+ if (dump == NULL) {
+ std::ostringstream str_bytes;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ fclose(f);
+ str_bytes << ib::bytes_iec{n_pages * sizeof(*dump)};
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate %s: %s",
+ str_bytes.str().c_str(),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
+ bpage != NULL && j < n_pages;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ const auto status = bpage->state();
+ if (status < buf_page_t::UNFIXED) {
+ ut_a(status >= buf_page_t::FREED);
+ continue;
+ }
+ const page_id_t id{bpage->id()};
+
+ if (id.space() == SRV_TMP_SPACE_ID) {
+ /* Ignore the innodb_temporary tablespace. */
+ continue;
+ }
+
+ dump[j++] = id;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ ut_a(j <= n_pages);
+ n_pages = j;
+
+ for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+ ret = fprintf(f, "%u,%u\n",
+ dump[j].space(), dump[j].page_no());
+ if (ret < 0) {
+ ut_free(dump);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot write to '%s': %s",
+ tmp_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ if (SHUTTING_DOWN() && !(j & 1023)) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Dumping buffer pool page "
+ ULINTPF "/" ULINTPF, j + 1, n_pages);
+ }
+ }
+
+ ut_free(dump);
+
+done:
+ ret = IF_WIN(my_fclose(f,0),fclose(f));
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot close '%s': %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ ret = unlink(full_filename);
+ if (ret != 0 && errno != ENOENT) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot delete '%s': %s",
+ full_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ ret = rename(tmp_filename, full_filename);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot rename '%s' to '%s': %s",
+ tmp_filename, full_filename,
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ /* success */
+
+ ut_sprintf_timestamp(now);
+
+ buf_dump_status(STATUS_INFO,
+ "Buffer pool(s) dump completed at %s", now);
+
+ /* Though dumping doesn't related to an incomplete load,
+ we reset this to 0 here to indicate that a shutdown can also perform
+ a dump */
+ export_vars.innodb_buffer_pool_load_incomplete = 0;
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+ char full_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ page_id_t* dump;
+ ulint dump_n;
+ ulint i;
+ uint32_t space_id;
+ uint32_t page_no;
+ int fscanf_ret;
+
+ /* Ignore any leftovers from before */
+ buf_load_abort_flag = false;
+
+ buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+ buf_load_status(STATUS_INFO,
+ "Loading buffer pool(s) from %s", full_filename);
+
+ f = fopen(full_filename, "r" STR_O_CLOEXEC);
+ if (f == NULL) {
+ buf_load_status(STATUS_INFO,
+ "Cannot open '%s' for reading: %s",
+ full_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* First scan the file to estimate how many entries are in it.
+ This file is tiny (approx 500KB per 1GB buffer pool), reading it
+ two times is fine. */
+ dump_n = 0;
+ while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
+ && !SHUTTING_DOWN()) {
+ dump_n++;
+ }
+
+ if (!SHUTTING_DOWN() && !feof(f)) {
+ /* fscanf() returned != 2 */
+ const char* what;
+ if (ferror(f)) {
+ what = "reading";
+ } else {
+ what = "parsing";
+ }
+ fclose(f);
+ buf_load_status(STATUS_ERR, "Error %s '%s',"
+ " unable to load buffer pool (stage 1)",
+ what, full_filename);
+ return;
+ }
+
+ /* If dump is larger than the buffer pool(s), then we ignore the
+ extra trailing. This could happen if a dump is made, then buffer
+ pool is shrunk and then load is attempted. */
+ dump_n = std::min(dump_n, buf_pool.get_n_pages());
+
+ if (dump_n != 0) {
+ dump = static_cast<page_id_t*>(ut_malloc_nokey(
+ dump_n * sizeof(*dump)));
+ } else {
+ fclose(f);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s"
+ " (%s was empty)", now, full_filename);
+ return;
+ }
+
+ if (dump == NULL) {
+ std::ostringstream str_bytes;
+ fclose(f);
+ str_bytes << ib::bytes_iec{dump_n * sizeof(*dump)};
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate %s: %s",
+ str_bytes.str().c_str(),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ rewind(f);
+
+ export_vars.innodb_buffer_pool_load_incomplete = 1;
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+ fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
+
+ if (fscanf_ret != 2) {
+ if (feof(f)) {
+ break;
+ }
+ /* else */
+
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s', unable"
+ " to load buffer pool (stage 2)",
+ full_filename);
+ return;
+ }
+
+ if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s': bogus"
+ " space,page %u,%u at line " ULINTPF
+ ", unable to load buffer pool",
+ full_filename,
+ space_id, page_no,
+ i);
+ return;
+ }
+
+ dump[i] = page_id_t(space_id, page_no);
+ }
+
+ /* Set dump_n to the actual number of initialized elements,
+ i could be smaller than dump_n here if the file got truncated after
+ we read it the first time. */
+ dump_n = i;
+
+ fclose(f);
+
+ if (dump_n == 0) {
+ ut_free(dump);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s"
+ " (%s was empty or had errors)", now, full_filename);
+ return;
+ }
+
+ if (!SHUTTING_DOWN()) {
+ std::sort(dump, dump + dump_n);
+ }
+
+ /* Avoid calling the expensive fil_space_t::get() for each
+ page within the same tablespace. dump[] is sorted by (space, page),
+ so all pages from a given tablespace are consecutive. */
+ uint32_t cur_space_id = dump[0].space();
+ fil_space_t* space = fil_space_t::get(cur_space_id);
+ ulint zip_size = space ? space->zip_size() : 0;
+
+ PSI_stage_progress* pfs_stage_progress __attribute__((unused))
+ = mysql_set_stage(srv_stage_buffer_pool_load.m_key);
+ mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
+ mysql_stage_set_work_completed(pfs_stage_progress, 0);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+ /* space_id for this iteration of the loop */
+ const uint32_t this_space_id = dump[i].space();
+
+ if (this_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
+ continue;
+ }
+
+ if (this_space_id != cur_space_id) {
+ if (space) {
+ space->release();
+ }
+
+ cur_space_id = this_space_id;
+ space = fil_space_t::get(cur_space_id);
+
+ if (!space) {
+ continue;
+ }
+
+ zip_size = space->zip_size();
+ }
+
+ /* JAN: TODO: As we use background page read below,
+ if tablespace is encrypted we cant use it. */
+ if (!space || dump[i].page_no() >= space->get_size() ||
+ (space->crypt_data &&
+ space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
+ space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+ continue;
+ }
+
+ if (space->is_stopping()) {
+ space->release();
+ space = nullptr;
+ continue;
+ }
+
+ space->reacquire();
+ buf_read_page_background(space, dump[i], zip_size);
+
+ if (buf_load_abort_flag) {
+ if (space) {
+ space->release();
+ }
+ buf_load_abort_flag = false;
+ ut_free(dump);
+ buf_load_status(
+ STATUS_INFO,
+ "Buffer pool(s) load aborted on request");
+ /* Premature end, set estimated = completed = i and
+ end the current stage event. */
+
+ mysql_stage_set_work_estimated(pfs_stage_progress, i);
+ mysql_stage_set_work_completed(pfs_stage_progress, i);
+
+ mysql_end_stage();
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ if ((i+1) >= srv_buf_pool_load_pages_abort) {
+ buf_load_abort_flag = true;
+ }
+#endif
+ }
+
+ if (space) {
+ space->release();
+ }
+
+ ut_free(dump);
+
+ if (i == dump_n) {
+ os_aio_wait_until_no_pending_reads(true);
+ }
+
+ ut_sprintf_timestamp(now);
+
+ if (i == dump_n) {
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load completed at %s", now);
+ export_vars.innodb_buffer_pool_load_incomplete = 0;
+ } else if (!buf_load_abort_flag) {
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load aborted due to user instigated abort at %s",
+ now);
+ /* intentionally don't reset innodb_buffer_pool_load_incomplete
+ as we don't want a shutdown to save the buffer pool */
+ } else {
+ buf_load_status(STATUS_INFO,
+ "Buffer pool(s) load aborted due to shutdown at %s",
+ now);
+ /* intentionally don't reset innodb_buffer_pool_load_incomplete
+ as we want to abort without saving the buffer pool */
+ }
+
+ /* Make sure that estimated = completed when we end. */
+ mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
+ /* End the stage progress event. */
+ mysql_end_stage();
+}
+
+/** Abort a currently running buffer pool load. */
+void buf_load_abort()
+{
+ buf_load_abort_flag= true;
+}
+
+/*****************************************************************//**
+This is the main task for buffer pool dump/load. when scheduled
+either performs a dump or load, depending on server state, state of the variables etc- */
+static void buf_dump_load_func(void *)
+{
+ ut_ad(!srv_read_only_mode);
+ static bool first_time = true;
+ if (first_time && srv_buffer_pool_load_at_startup) {
+
+#ifdef WITH_WSREP
+ if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ srv_thread_pool->set_concurrency(srv_n_read_io_threads);
+ buf_load();
+ srv_thread_pool->set_concurrency();
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
+ }
+ first_time = false;
+
+ while (!SHUTTING_DOWN()) {
+ if (buf_dump_should_start) {
+ buf_dump_should_start = false;
+ buf_dump(true);
+ }
+ if (buf_load_should_start) {
+ buf_load_should_start = false;
+ buf_load();
+ }
+
+ if (!buf_dump_should_start && !buf_load_should_start) {
+ return;
+ }
+ }
+
+ /* In shutdown */
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ if (export_vars.innodb_buffer_pool_load_incomplete) {
+ buf_dump_status(STATUS_INFO,
+ "Dumping of buffer pool not started"
+ " as load was incomplete");
+#ifdef WITH_WSREP
+ } else if (get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ } else {
+ buf_dump(false/* do complete dump at shutdown */);
+ }
+ }
+}
+
+
+/* Execute task with max.concurrency */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled;
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
+void buf_load_at_startup()
+{
+ load_dump_enabled= true;
+ if (srv_buffer_pool_load_at_startup)
+ buf_do_load_dump();
+}
+
+static void buf_do_load_dump()
+{
+ if (load_dump_enabled && !buf_dump_load_task.is_running())
+ srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for currently running load/dumps to finish*/
+void buf_load_dump_end()
+{
+ ut_ad(SHUTTING_DOWN());
+ buf_dump_load_task.wait();
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..b6357989
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2765 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2014, Fusion-io
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <my_service_manager.h>
+#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
+
+#include "buf0flu.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+#include "srv0start.h"
+#include "page0zip.h"
+#include "fil0fil.h"
+#include "log0crypt.h"
+#include "srv0mon.h"
+#include "fil0pagecompress.h"
+#include "lzo/lzo1x.h"
+#include "snappy-c.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+ulint buf_lru_flush_page_count;
+
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Factor for scan length to determine n_pages for intended oldest LSN
+progress */
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
+
+/** Average redo generation rate */
+static lsn_t lsn_avg_rate = 0;
+
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** Page cleaner structure */
+static struct
+{
+ /** total elapsed time in adaptive flushing, in seconds */
+ ulint flush_time;
+ /** number of adaptive flushing passes */
+ ulint flush_pass;
+} page_cleaner;
+
+/* @} */
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
+
+/** Validates the flush list some of the time. */
+static void buf_flush_validate_skip()
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return;
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ buf_flush_validate_low();
+}
+#endif /* UNIV_DEBUG */
+
+void buf_pool_t::page_cleaner_wakeup(bool for_LRU)
+{
+ ut_d(buf_flush_validate_skip());
+ if (!page_cleaner_idle())
+ {
+ if (for_LRU)
+ /* Ensure that the page cleaner is not in a timed wait. */
+ pthread_cond_signal(&do_flush_list);
+ return;
+ }
+ double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+ /* if pct_lwm != 0.0, adaptive flushing is enabled.
+ signal buf page cleaner thread
+ - if pct_lwm <= dirty_pct then it will invoke apdative flushing flow
+ - if pct_lwm > dirty_pct then it will invoke idle flushing flow.
+
+ idle_flushing:
+ dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an
+ idle flushing use-case.
+
+ Why is last_activity_count not updated always?
+ - let's first understand when is server activity count updated.
+ - it is updated on commit of a transaction trx_t::commit() and not
+ on adding a page to the flush list.
+ - page_cleaner_wakeup is called when a page is added to the flush list.
+
+ - now let's say the first user thread, updates the count from X -> Y but
+ is yet to commit the transaction (so activity count is still Y).
+ followup user threads will see the updated count as (Y) that is matching
+ the universal server activity count (Y), giving a false impression that
+ the server is idle.
+
+ How to avoid this?
+ - by allowing last_activity_count to updated when page-cleaner is made
+ active and has work to do. This ensures that the last_activity signal
+ is consumed by the page-cleaner before the next one is generated. */
+ if (for_LRU ||
+ (pct_lwm != 0.0 && (pct_lwm <= dirty_pct ||
+ last_activity_count == srv_get_activity_count())) ||
+ srv_max_buf_pool_modified_pct <= dirty_pct)
+ {
+ page_cleaner_status-= PAGE_CLEANER_IDLE;
+ pthread_cond_signal(&do_flush_list);
+ }
+}
+
+/** Remove a block from flush_list.
+@param bpage buffer pool page */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) noexcept
+{
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ flush_hp.adjust(bpage);
+ UT_LIST_REMOVE(flush_list, bpage);
+ flush_list_bytes-= bpage->physical_size();
+ bpage->clear_oldest_modification();
+#ifdef UNIV_DEBUG
+ buf_flush_validate_skip();
+#endif /* UNIV_DEBUG */
+}
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id)
+{
+ const page_id_t first(id, 0), end(id + 1, 0);
+ ut_ad(id);
+
+ for (;;)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ bool deferred= false;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ const auto s= bpage->state();
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ const page_id_t bpage_id(bpage->id());
+
+ if (bpage_id < first || bpage_id >= end);
+ else if (s >= buf_page_t::WRITE_FIX)
+ deferred= true;
+ else
+ buf_pool.delete_from_flush_list(bpage);
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!deferred)
+ break;
+
+ os_aio_wait_until_no_pending_writes(true);
+ }
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+
+ const lsn_t lsn = bpage->oldest_modification();
+
+ if (!lsn) {
+ return;
+ }
+
+ ut_ad(lsn == 1 || lsn > 2);
+ ut_ad(dpage->oldest_modification() == lsn);
+
+ /* Important that we adjust the hazard pointer before removing
+ the bpage from the flush list. */
+ buf_pool.flush_hp.adjust(bpage);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(buf_pool.flush_list, bpage);
+
+ bpage->clear_oldest_modification();
+
+ if (lsn == 1) {
+ buf_pool.flush_list_bytes -= dpage->physical_size();
+ dpage->list.prev = nullptr;
+ dpage->list.next = nullptr;
+ dpage->clear_oldest_modification();
+ } else if (prev) {
+ ut_ad(prev->oldest_modification());
+ UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
+ }
+
+ ut_d(buf_flush_validate_low());
+}
+
+/** Note that a block is no longer dirty, while not removing
+it from buf_pool.flush_list
+@param temporary whether the page belongs to the temporary tablespace
+@param error whether an error may have occurred while writing */
+inline void buf_page_t::write_complete(bool temporary, bool error)
+{
+ ut_ad(temporary == fsp_is_system_temporary(id().space()));
+ if (UNIV_UNLIKELY(error));
+ else if (temporary)
+ {
+ ut_ad(oldest_modification() == 2);
+ oldest_modification_= 0;
+ }
+ else
+ {
+ /* We use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+ const auto s= state();
+ ut_ad(s >= WRITE_FIX);
+ zip.fix.fetch_sub((s >= WRITE_FIX_REINIT)
+ ? (WRITE_FIX_REINIT - UNFIXED)
+ : (WRITE_FIX - UNFIXED));
+ lock.u_unlock(true);
+}
+
+inline void buf_pool_t::n_flush_inc()
+{
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ page_cleaner_status+= LRU_FLUSH;
+}
+
+inline void buf_pool_t::n_flush_dec()
+{
+ mysql_mutex_lock(&flush_list_mutex);
+ ut_ad(page_cleaner_status >= LRU_FLUSH);
+ if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH)
+ pthread_cond_broadcast(&done_flush_LRU);
+ mysql_mutex_unlock(&flush_list_mutex);
+}
+
+inline void buf_pool_t::n_flush_dec_holding_mutex()
+{
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ ut_ad(page_cleaner_status >= LRU_FLUSH);
+ page_cleaner_status-= LRU_FLUSH;
+}
+
+/** Complete write of a file page from buf_pool.
+@param request write request
+@param error whether the write may have failed */
+void buf_page_write_complete(const IORequest &request, bool error)
+{
+ ut_ad(request.is_write());
+ ut_ad(!srv_read_only_mode);
+ buf_page_t *bpage= request.bpage;
+ ut_ad(bpage);
+ const auto state= bpage->state();
+ /* io-fix can only be cleared by buf_page_t::write_complete()
+ and buf_page_t::read_complete() */
+ ut_ad(state >= buf_page_t::WRITE_FIX);
+ ut_ad(!buf_dblwr.is_inside(bpage->id()));
+ ut_ad(request.node->space->id == bpage->id().space());
+
+ if (request.slot)
+ request.slot->release();
+
+ if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+ buf_page_monitor(*bpage, false);
+ DBUG_PRINT("ib_buf", ("write page %u:%u",
+ bpage->id().space(), bpage->id().page_no()));
+
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+ if (request.is_LRU())
+ {
+ const bool temp= bpage->oldest_modification() == 2;
+ if (!temp && state < buf_page_t::WRITE_FIX_REINIT &&
+ request.node->space->use_doublewrite())
+ buf_dblwr.write_completed();
+ /* We must hold buf_pool.mutex while releasing the block, so that
+ no other thread can access it before we have freed it. */
+ mysql_mutex_lock(&buf_pool.mutex);
+ bpage->write_complete(temp, error);
+ if (!error)
+ buf_LRU_free_page(bpage, true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ buf_pool.n_flush_dec();
+ }
+ else
+ {
+ if (state < buf_page_t::WRITE_FIX_REINIT &&
+ request.node->space->use_doublewrite())
+ buf_dblwr.write_completed();
+ bpage->write_complete(false, error);
+ }
+}
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out] page page to update
+@param[in] size compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
+{
+ ut_ad(size > 0);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ page_zip_calc_checksum(page, size, false));
+}
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out] page page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page)
+{
+ ut_d(bool compressed = false);
+ ut_d(bool corrupted = false);
+ ut_d(const uint size = buf_page_full_crc32_size(page, &compressed,
+ &corrupted));
+ ut_ad(!compressed);
+ ut_ad(!corrupted);
+ ut_ad(size == uint(srv_page_size));
+ const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(page + payload, my_crc32c(0, page, payload));
+}
+
+/** Initialize a page for writing to the tablespace.
+@param[in] block buffer block; NULL if bypassing
+ the buffer pool
+@param[in,out] page page frame
+@param[in,out] page_zip_ compressed page, or NULL if
+ uncompressed
+@param[in] use_full_checksum whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+ const buf_block_t* block,
+ byte* page,
+ void* page_zip_,
+ bool use_full_checksum)
+{
+ if (block && block->page.frame != page) {
+ /* If page is encrypted in full crc32 format then
+ checksum stored already as a part of fil_encrypt_buf() */
+ ut_ad(use_full_checksum);
+ return;
+ }
+
+ ut_ad(!block || block->page.frame == page);
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip;
+ ulint size;
+
+ page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ ut_ad(!block || &block->page.zip == page_zip);
+ size = page_zip_get_size(page_zip);
+
+ ut_ad(size);
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size <= UNIV_ZIP_SIZE_MAX);
+
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ buf_flush_update_zip_checksum(page_zip->data, size);
+ return;
+ }
+
+ ib::error() << "The compressed page to be written"
+ " seems corrupt:";
+ ut_print_buf(stderr, page, size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ if (use_full_checksum) {
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
+ memcpy_aligned<4>(page + srv_page_size
+ - FIL_PAGE_FCRC32_END_LSN,
+ FIL_PAGE_LSN + 4 + page, 4);
+ return buf_flush_assign_full_crc32_checksum(page);
+ }
+
+ static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
+ static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
+ memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ FIL_PAGE_LSN + page, 8);
+
+ if (block && srv_page_size == 16384) {
+ /* The page type could be garbage in old files
+ created before MySQL 5.5. Such files always
+ had a page size of 16 kilobytes. */
+ ulint page_type = fil_page_get_type(page);
+ ulint reset_type = page_type;
+
+ switch (block->page.id().page_no() % 16384) {
+ case 0:
+ reset_type = block->page.id().page_no() == 0
+ ? FIL_PAGE_TYPE_FSP_HDR
+ : FIL_PAGE_TYPE_XDES;
+ break;
+ case 1:
+ reset_type = FIL_PAGE_IBUF_BITMAP;
+ break;
+ case FSP_TRX_SYS_PAGE_NO:
+ if (block->page.id()
+ == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
+ reset_type = FIL_PAGE_TYPE_TRX_SYS;
+ break;
+ }
+ /* fall through */
+ default:
+ switch (page_type) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ case FIL_PAGE_UNDO_LOG:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_FREE_LIST:
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_TYPE_SYS:
+ case FIL_PAGE_TYPE_TRX_SYS:
+ case FIL_PAGE_TYPE_BLOB:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_IBUF_BITMAP:
+ /* These pages should have
+ predetermined page numbers
+ (see above). */
+ default:
+ reset_type = FIL_PAGE_TYPE_UNKNOWN;
+ break;
+ }
+ }
+
+ if (UNIV_UNLIKELY(page_type != reset_type)) {
+ ib::info()
+ << "Resetting invalid page "
+ << block->page.id() << " type "
+ << page_type << " to "
+ << reset_type << " when flushing.";
+ fil_page_set_type(page, reset_type);
+ }
+ }
+
+ const uint32_t checksum = buf_calc_page_crc32(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+ mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum);
+}
+
+/** Reserve a buffer for compression.
+@param[in,out] slot reserved slot */
+static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
+{
+ if (slot->comp_buf)
+ return;
+ /* Both Snappy and LZO compression methods require that the output
+ buffer be bigger than input buffer. Adjust the allocated size. */
+ ulint size= srv_page_size;
+ if (provider_service_lzo->is_loaded)
+ size= LZO1X_1_15_MEM_COMPRESS;
+ else if (provider_service_snappy->is_loaded)
+ size= snappy_max_compressed_length(size);
+ slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
+}
+
+/** Encrypt a buffer of temporary tablespace
+@param[in] offset Page offset
+@param[in] s Page to encrypt
+@param[in,out] d Output buffer
+@return encrypted buffer or NULL */
+static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
+{
+ /* Calculate the start offset in a page */
+ uint srclen= static_cast<uint>(srv_page_size) -
+ (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
+ FIL_PAGE_FCRC32_CHECKSUM);
+ const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+ memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
+ return NULL;
+
+ const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(d + payload, my_crc32c(0, d, payload));
+
+ srv_stats.pages_encrypted.inc();
+ srv_stats.n_temp_blocks_encrypted.inc();
+ return d;
+}
+
+/** Encryption and page_compression hook that is called just before
+a page is written to disk.
+@param[in,out] space tablespace
+@param[in,out] bpage buffer page
+@param[in] s physical page frame that is being encrypted
+@param[in,out] size payload size in bytes
+@return page frame to be written to file
+(may be src_frame or an encrypted/compressed copy of it) */
+static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
+ buf_tmp_buffer_t **slot, size_t *size)
+{
+ ut_ad(!bpage->is_freed());
+ ut_ad(space->id == bpage->id().space());
+ ut_ad(!*slot);
+
+ const uint32_t page_no= bpage->id().page_no();
+
+ switch (page_no) {
+ case TRX_SYS_PAGE_NO:
+ if (bpage->id().space() != TRX_SYS_SPACE)
+ break;
+ /* The TRX_SYS page is neither encrypted nor compressed, because
+ it contains the address of the doublewrite buffer. */
+ /* fall through */
+ case 0:
+ /* Page 0 of a tablespace is not encrypted/compressed */
+ return s;
+ }
+
+ fil_space_crypt_t *crypt_data= space->crypt_data;
+ bool encrypted, page_compressed;
+ if (space->purpose == FIL_TYPE_TEMPORARY)
+ {
+ ut_ad(!crypt_data);
+ encrypted= innodb_encrypt_temporary_tables;
+ page_compressed= false;
+ }
+ else
+ {
+ encrypted= crypt_data && !crypt_data->not_encrypted() &&
+ crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+ (!crypt_data->is_default_encryption() || srv_encrypt_tables);
+ page_compressed= space->is_compressed();
+ }
+
+ const bool full_crc32= space->full_crc32();
+
+ if (!encrypted && !page_compressed)
+ {
+ /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
+ "not perfect alignment");
+ if (full_crc32)
+ memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
+ else
+ memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ return s;
+ }
+
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+ if (full_crc32)
+ memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
+ FIL_PAGE_LSN + 4 + s, 4);
+
+ ut_ad(!bpage->zip_size() || !page_compressed);
+ /* Find free slot from temporary memory array */
+ *slot= buf_pool.io_buf_reserve();
+ ut_a(*slot);
+ (*slot)->allocate();
+
+ byte *d= (*slot)->crypt_buf;
+
+ if (!page_compressed)
+ {
+not_compressed:
+ d= space->purpose == FIL_TYPE_TEMPORARY
+ ? buf_tmp_page_encrypt(page_no, s, d)
+ : fil_space_encrypt(space, page_no, s, d);
+ }
+ else
+ {
+ ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
+ /* First we compress the page content */
+ buf_tmp_reserve_compression_buf(*slot);
+ byte *tmp= (*slot)->comp_buf;
+ ulint len= fil_page_compress(s, tmp, space->flags,
+ fil_space_get_block_size(space, page_no),
+ encrypted);
+
+ if (!len)
+ goto not_compressed;
+
+ *size= len;
+
+ if (full_crc32)
+ {
+ ut_d(bool compressed = false);
+ len= buf_page_full_crc32_size(tmp,
+#ifdef UNIV_DEBUG
+ &compressed,
+#else
+ NULL,
+#endif
+ NULL);
+ ut_ad(compressed);
+ }
+
+ /* Workaround for MDEV-15527. */
+ memset(tmp + len, 0 , srv_page_size - len);
+
+ if (encrypted)
+ tmp= fil_space_encrypt(space, page_no, tmp, d);
+
+ if (full_crc32)
+ {
+ static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+ mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4));
+ ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
+ }
+
+ d= tmp;
+ }
+
+ (*slot)->out_buf= d;
+ return d;
+}
+
+/** Free a page whose underlying file page has been freed. */
+ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept
+{
+ mysql_mutex_assert_owner(&mutex);
+ ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
+ if (fsp_is_system_temporary(bpage->id().space()))
+ {
+ ut_ad(bpage->frame);
+ ut_ad(oldest_modification == 2);
+ bpage->clear_oldest_modification();
+ }
+ else
+ {
+ mysql_mutex_lock(&flush_list_mutex);
+ ut_ad(oldest_modification > 2);
+ delete_from_flush_list(bpage);
+ mysql_mutex_unlock(&flush_list_mutex);
+ }
+
+ bpage->lock.u_unlock(true);
+ buf_LRU_free_page(bpage, true);
+}
+
+/** Write a flushable page to a file or free a freeable block.
+@param evict whether to evict the page on write completion
+@param space tablespace
+@return whether a page write was initiated and buf_pool.mutex released */
+bool buf_page_t::flush(bool evict, fil_space_t *space)
+{
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ ut_ad(in_file());
+ ut_ad(in_LRU_list);
+ ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
+ (space == fil_system.temp_space));
+ ut_ad(evict || space != fil_system.temp_space);
+ ut_ad(space->referenced());
+
+ const auto s= state();
+ ut_a(s >= FREED);
+
+ if (s < UNFIXED)
+ {
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+ {
+ const lsn_t lsn=
+ mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
+ ut_ad(lsn >= oldest_modification());
+ if (lsn > log_sys.get_flushed_lsn())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ log_write_up_to(lsn, true);
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ }
+ buf_pool.release_freed_page(this);
+ return false;
+ }
+
+ ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
+ ut_ad(f >= UNFIXED);
+ ut_ad(f < READ_FIX);
+ ut_ad((space == fil_system.temp_space)
+ ? oldest_modification() == 2
+ : oldest_modification() > 2);
+
+ /* Increment the I/O operation count used for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ IORequest::Type type= IORequest::WRITE_ASYNC;
+ if (UNIV_UNLIKELY(evict))
+ {
+ type= IORequest::WRITE_LRU;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
+
+ /* Apart from the U-lock, this block will also be protected by
+ is_write_fixed() and oldest_modification()>1.
+ Thus, it cannot be relocated or removed. */
+
+ DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
+ evict ? "LRU" : "flush_list",
+ id().space(), id().page_no()));
+
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(this);
+ page_t *write_frame= zip.data;
+
+ space->reacquire();
+ size_t size;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ size_t orig_size;
+#endif
+ buf_tmp_buffer_t *slot= nullptr;
+
+ if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */
+ {
+ ut_ad(!space->full_crc32());
+ ut_ad(!space->is_compressed()); /* not page_compressed */
+ size= zip_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ orig_size= size;
+#endif
+ buf_flush_update_zip_checksum(write_frame, size);
+ write_frame= buf_page_encrypt(space, this, write_frame, &slot, &size);
+ ut_ad(size == zip_size());
+ }
+ else
+ {
+ byte *page= frame;
+ size= block->physical_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ orig_size= size;
+#endif
+
+ if (space->full_crc32())
+ {
+ /* innodb_checksum_algorithm=full_crc32 is not implemented for
+ ROW_FORMAT=COMPRESSED pages. */
+ ut_ad(!write_frame);
+ page= buf_page_encrypt(space, this, page, &slot, &size);
+ buf_flush_init_for_writing(block, page, nullptr, true);
+ }
+ else
+ {
+ buf_flush_init_for_writing(block, page, write_frame ? &zip : nullptr,
+ false);
+ page= buf_page_encrypt(space, this, write_frame ? write_frame : page,
+ &slot, &size);
+ }
+
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ if (size != orig_size)
+ {
+ switch (space->chain.start->punch_hole) {
+ case 1:
+ static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH ==
+ IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, "");
+ type=
+ IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC));
+ break;
+ case 2:
+ size= orig_size;
+ }
+ }
+#endif
+ write_frame= page;
+ }
+
+ if ((s & LRU_MASK) == REINIT || !space->use_doublewrite())
+ {
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+ {
+ const lsn_t lsn=
+ mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN +
+ (write_frame ? write_frame
+ : frame)));
+ ut_ad(lsn >= oldest_modification());
+ log_write_up_to(lsn, true);
+ }
+ space->io(IORequest{type, this, slot}, physical_offset(), size,
+ write_frame, this);
+ }
+ else
+ buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type},
+ size);
+ return true;
+}
+
+/** Check whether a page can be flushed from the buf_pool.
+@param id page identifier
+@param fold id.fold()
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
+@return whether the page can be flushed */
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold,
+ bool evict)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(fold == id.fold());
+
+ /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */
+ const buf_page_t *bpage=
+ buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold));
+
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ return false;
+
+ /* We avoid flushing 'non-old' blocks in an eviction flush, because the
+ flushed blocks are soon freed */
+ if (evict && !bpage->is_old())
+ return false;
+
+ return bpage->oldest_modification() > 1 && !bpage->is_io_fixed();
+}
+
+/** Check which neighbors of a page can be flushed from the buf_pool.
+@param space tablespace
+@param id page identifier of a dirty page
+@param contiguous whether to consider contiguous areas of pages
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
+@return last page number that can be flushed */
+static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
+ page_id_t &id, bool contiguous,
+ bool evict)
+{
+ ut_ad(id.page_no() < space.size +
+ (space.physical_size() == 2048 ? 1
+ : space.physical_size() == 1024 ? 3 : 0));
+ /* When flushed, dirty blocks are searched in neighborhoods of this
+ size, and flushed along with the original page. */
+ const ulint s= buf_pool.curr_size / 16;
+ const uint32_t read_ahead= buf_pool.read_ahead_area;
+ const uint32_t buf_flush_area= read_ahead > s
+ ? static_cast<uint32_t>(s) : read_ahead;
+ page_id_t low= id - (id.page_no() % buf_flush_area);
+ page_id_t high= low + buf_flush_area;
+ high.set_page_no(std::min(high.page_no(), space.last_page_number()));
+
+ if (!contiguous)
+ {
+ high= std::max(id + 1, high);
+ id= low;
+ return high;
+ }
+
+ /* Determine the contiguous dirty area around id. */
+ const ulint id_fold= id.fold();
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (id > low)
+ {
+ ulint fold= id_fold;
+ for (page_id_t i= id - 1;; --i)
+ {
+ fold--;
+ if (!buf_flush_check_neighbor(i, fold, evict))
+ {
+ low= i + 1;
+ break;
+ }
+ if (i == low)
+ break;
+ }
+ }
+
+ page_id_t i= id;
+ id= low;
+ ulint fold= id_fold;
+ while (++i < high)
+ {
+ ++fold;
+ if (!buf_flush_check_neighbor(i, fold, evict))
+ break;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return i;
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Apply freed_ranges to the file.
+@param writable whether the file is writable
+@return number of pages written or hole-punched */
+uint32_t fil_space_t::flush_freed(bool writable)
+{
+ const bool punch_hole= chain.start->punch_hole == 1;
+ if (!punch_hole && !srv_immediate_scrub_data_uncompressed)
+ return 0;
+
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+
+ for (;;)
+ {
+ freed_range_mutex.lock();
+ if (freed_ranges.empty())
+ {
+ freed_range_mutex.unlock();
+ return 0;
+ }
+ const lsn_t flush_lsn= last_freed_lsn;
+ if (log_sys.get_flushed_lsn() >= flush_lsn)
+ break;
+ freed_range_mutex.unlock();
+ log_write_up_to(flush_lsn, true);
+ }
+
+ const unsigned physical{physical_size()};
+
+ range_set freed= std::move(freed_ranges);
+ uint32_t written= 0;
+
+ if (!writable);
+ else if (punch_hole)
+ {
+ for (const auto &range : freed)
+ {
+ written+= range.last - range.first + 1;
+ reacquire();
+ io(IORequest(IORequest::PUNCH_RANGE),
+ os_offset_t{range.first} * physical,
+ (range.last - range.first + 1) * physical, nullptr);
+ }
+ }
+ else
+ {
+ for (const auto &range : freed)
+ {
+ written+= range.last - range.first + 1;
+ for (os_offset_t i= range.first; i <= range.last; i++)
+ {
+ reacquire();
+ io(IORequest(IORequest::WRITE_ASYNC), i * physical, physical,
+ const_cast<byte*>(field_ref_zero));
+ }
+ }
+ }
+
+ freed_range_mutex.unlock();
+ return written;
+}
+
+/** Flushes to disk all flushable pages within the flush area
+and also write zeroes or punch the hole for the freed ranges of pages.
+@param space tablespace
+@param page_id page identifier
+@param bpage buffer page
+@param contiguous whether to consider contiguous areas of pages
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
+@param n_flushed number of pages flushed so far in this batch
+@param n_to_flush maximum number of pages we are allowed to flush
+@return number of pages flushed */
+static ulint buf_flush_try_neighbors(fil_space_t *space,
+ const page_id_t page_id,
+ buf_page_t *bpage,
+ bool contiguous, bool evict,
+ ulint n_flushed, ulint n_to_flush)
+{
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ ut_ad(space->id == page_id.space());
+ ut_ad(bpage->id() == page_id);
+
+ ulint count= 0;
+ page_id_t id= page_id;
+ page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
+
+ ut_ad(page_id >= id);
+ ut_ad(page_id < high);
+
+ for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold)
+ {
+ if (UNIV_UNLIKELY(space->is_stopping_writes()))
+ {
+ if (bpage)
+ bpage->lock.u_unlock(true);
+ break;
+ }
+
+ if (count + n_flushed >= n_to_flush)
+ {
+ if (id > page_id)
+ break;
+ /* If the page whose neighbors we are flushing has not been
+ flushed yet, we must flush the page that we selected originally. */
+ id= page_id;
+ id_fold= id.fold();
+ }
+
+ const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold);
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (buf_page_t *b= buf_pool.page_hash.get(id, chain))
+ {
+ ut_ad(b->in_file());
+ if (id == page_id)
+ {
+ ut_ad(bpage == b);
+ bpage= nullptr;
+ ut_ad(!buf_pool.watch_is_sentinel(*b));
+ ut_ad(b->oldest_modification() > 1);
+ flush:
+ if (b->flush(evict, space))
+ {
+ ++count;
+ continue;
+ }
+ }
+ /* We avoid flushing 'non-old' blocks in an eviction flush,
+ because the flushed blocks are soon freed */
+ else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) &&
+ b->oldest_modification() > 1 && b->lock.u_lock_try(true))
+ {
+ if (b->oldest_modification() < 2)
+ b->lock.u_unlock(true);
+ else
+ goto flush;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+
+ if (count > 1)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1);
+ }
+
+ return count;
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+@return number of blocks moved to the free list. */
+static ulint buf_free_from_unzip_LRU_list_batch()
+{
+ ulint scanned = 0;
+ ulint count = 0;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+
+ while (block
+ && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
+ && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+
+ ++scanned;
+ if (buf_LRU_free_page(&block->page, false)) {
+ /* Block was freed. buf_pool.mutex potentially
+ released and reacquired */
+ ++count;
+ block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ } else {
+ block = UT_LIST_GET_PREV(unzip_LRU, block);
+ }
+ }
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/** Acquire a tablespace reference for writing.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if the tablespace is missing or inaccessible */
+fil_space_t *fil_space_t::get_for_write(uint32_t id)
+{
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+ const uint32_t n= space ? space->acquire_low(STOPPING_WRITES) : 0;
+
+ if (n & STOPPING_WRITES)
+ space= nullptr;
+ else if ((n & CLOSING) && !space->prepare_acquired())
+ space= nullptr;
+
+ mysql_mutex_unlock(&fil_system.mutex);
+ return space;
+}
+
+/** Start writing out pages for a tablespace.
+@param id tablespace identifier
+@return tablespace and number of pages written */
+static std::pair<fil_space_t*, uint32_t> buf_flush_space(const uint32_t id)
+{
+ if (fil_space_t *space= fil_space_t::get_for_write(id))
+ return {space, space->flush_freed(true)};
+ return {nullptr, 0};
+}
+
+struct flush_counters_t
+{
+ /** number of dirty pages flushed */
+ ulint flushed;
+ /** number of clean pages evicted */
+ ulint evicted;
+};
+
+/** Discard a dirty page, and release buf_pool.flush_list_mutex.
+@param bpage dirty page whose tablespace is not accessible */
+static void buf_flush_discard_page(buf_page_t *bpage)
+{
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->oldest_modification());
+
+ buf_pool.delete_from_flush_list(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ ut_d(const auto state= bpage->state());
+ ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED ||
+ state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT);
+ bpage->lock.u_unlock(true);
+ buf_LRU_free_page(bpage, true);
+}
+
+/** Flush dirty blocks from the end buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+@param max maximum number of blocks to flush
+@param evict whether dirty pages are to be evicted after flushing them
+@param n counts of flushed and evicted pages */
+static void buf_flush_LRU_list_batch(ulint max, bool evict,
+ flush_counters_t *n)
+{
+ ulint scanned= 0;
+ ulint free_limit= srv_LRU_scan_depth;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ if (buf_pool.withdraw_target && buf_pool.is_shrinking())
+ free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
+
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
+ bpage &&
+ ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
+ UT_LIST_GET_LEN(buf_pool.free) < free_limit) ||
+ recv_recovery_is_on());
+ ++scanned, bpage= buf_pool.lru_hp.get())
+ {
+ buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool.lru_hp.set(prev);
+ auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(bpage->in_LRU_list);
+
+ if (!bpage->oldest_modification())
+ {
+ evict:
+ if (state != buf_page_t::FREED &&
+ (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state)))
+ continue;
+ buf_LRU_free_page(bpage, true);
+ ++n->evicted;
+ if (UNIV_LIKELY(scanned & 31))
+ continue;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ continue;
+ }
+
+ if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true))
+ {
+ ut_ad(!bpage->is_io_fixed());
+ bool do_evict= evict;
+ switch (bpage->oldest_modification()) {
+ case 1:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (ut_d(lsn_t lsn=) bpage->oldest_modification())
+ {
+ ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */
+ buf_pool.delete_from_flush_list(bpage);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ /* fall through */
+ case 0:
+ bpage->lock.u_unlock(true);
+ goto evict;
+ case 2:
+ /* LRU flushing will always evict pages of the temporary tablespace. */
+ do_evict= true;
+ }
+ /* Block is ready for flush. Dispatch an IO request.
+ If do_evict, the page may be evicted by buf_page_write_complete(). */
+ const page_id_t page_id(bpage->id());
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ buf_pool.lru_hp.set(bpage);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ if (space)
+ space->release();
+ auto p= buf_flush_space(space_id);
+ space= p.first;
+ last_space_id= space_id;
+ if (!space)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ goto no_space;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_pool.stat.n_pages_written+= p.second;
+ }
+ else
+ {
+ ut_ad(!space);
+ goto no_space;
+ }
+ }
+ else if (space->is_stopping_writes())
+ {
+ space->release();
+ space= nullptr;
+ no_space:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_discard_page(bpage);
+ continue;
+ }
+
+ if (n->flushed >= max && !recv_recovery_is_on())
+ {
+ bpage->lock.u_unlock(true);
+ break;
+ }
+
+ if (neighbors && space->is_rotational())
+ n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
+ neighbors == 1,
+ do_evict, n->flushed, max);
+ else if (bpage->flush(do_evict, space))
+ ++n->flushed;
+ else
+ continue;
+
+ goto reacquire_mutex;
+ }
+ else
+ /* Can't evict or dispatch this block. Go to previous. */
+ ut_ad(buf_pool.lru_hp.is_hp(prev));
+ }
+
+ buf_pool.lru_hp.set(nullptr);
+
+ if (space)
+ space->release();
+
+ if (scanned)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+}
+
+/** Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+@param max maximum number of blocks to flush
+@param evict whether dirty pages are to be evicted after flushing them
+@param n counts of flushed and evicted pages */
+static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n)
+{
+ if (buf_LRU_evict_from_unzip_LRU())
+ buf_free_from_unzip_LRU_list_batch();
+ n->evicted= 0;
+ n->flushed= 0;
+ buf_flush_LRU_list_batch(max, evict, n);
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ buf_lru_freed_page_count+= n->evicted;
+ buf_lru_flush_page_count+= n->flushed;
+ buf_pool.stat.n_pages_written+= n->flushed;
+}
+
+/** This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+@param max_n maximum mumber of blocks to flush
+@param lsn once an oldest_modification>=lsn is found, terminate the batch
+@return number of blocks for which the write request was queued */
+static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
+{
+ ulint count= 0;
+ ulint scanned= 0;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+
+ const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+ ? 0 : srv_flush_neighbors;
+ fil_space_t *space= nullptr;
+ uint32_t last_space_id= FIL_NULL;
+ static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+ static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+ /* Start from the end of the list looking for a suitable block to be
+ flushed. */
+ ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+ bpage && len && count < max_n; ++scanned, len--)
+ {
+ const lsn_t oldest_modification= bpage->oldest_modification();
+ if (oldest_modification >= lsn)
+ break;
+ ut_ad(bpage->in_file());
+
+ {
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ if (oldest_modification == 1)
+ {
+ clear:
+ buf_pool.delete_from_flush_list(bpage);
+ skip:
+ bpage= prev;
+ continue;
+ }
+
+ ut_ad(oldest_modification > 2);
+
+ if (!bpage->lock.u_lock_try(true))
+ goto skip;
+
+ ut_ad(!bpage->is_io_fixed());
+
+ if (bpage->oldest_modification() == 1)
+ {
+ bpage->lock.u_unlock(true);
+ goto clear;
+ }
+
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: A concurrent execution of buf_flush_list_space() may
+ terminate this scan prematurely. The buf_pool.flush_list_active
+ should prevent multiple threads from executing
+ buf_do_flush_list_batch() concurrently,
+ but buf_flush_list_space() is ignoring that. */
+ buf_pool.flush_hp.set(prev);
+ }
+
+ const page_id_t page_id(bpage->id());
+ const uint32_t space_id= page_id.space();
+ if (!space || space->id != space_id)
+ {
+ if (last_space_id != space_id)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ if (space)
+ space->release();
+ auto p= buf_flush_space(space_id);
+ space= p.first;
+ last_space_id= space_id;
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_pool.stat.n_pages_written+= p.second;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+ else
+ ut_ad(!space);
+ }
+ else if (space->is_stopping_writes())
+ {
+ space->release();
+ space= nullptr;
+ }
+
+ if (!space)
+ buf_flush_discard_page(bpage);
+ else
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ do
+ {
+ if (neighbors && space->is_rotational())
+ count+= buf_flush_try_neighbors(space, page_id, bpage,
+ neighbors == 1, false, count, max_n);
+ else if (bpage->flush(false, space))
+ ++count;
+ else
+ continue;
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ while (0);
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ bpage= buf_pool.flush_hp.get();
+ }
+
+ buf_pool.flush_hp.set(nullptr);
+
+ if (space)
+ space->release();
+
+ if (scanned)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ scanned);
+ return count;
+}
+
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end()
+{
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
+
+ if (buf_pool.n_flush())
+ {
+ tpool::tpool_wait_begin();
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ do
+ my_cond_wait(&buf_pool.done_flush_LRU,
+ &buf_pool.flush_list_mutex.m_mutex);
+ while (buf_pool.n_flush());
+ tpool::tpool_wait_end();
+ thd_wait_end(nullptr);
+ }
+}
+
+/** Write out dirty blocks from buf_pool.flush_list.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n wished maximum mumber of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED,
+ lsn_t lsn= LSN_MAX)
+{
+ ut_ad(lsn);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.flush_list_active())
+ {
+nothing_to_do:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ return 0;
+ }
+ if (!buf_pool.get_oldest_modification(0))
+ {
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ goto nothing_to_do;
+ }
+ buf_pool.flush_list_set_active();
+ const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+ if (n_flushed)
+ buf_pool.stat.n_pages_written+= n_flushed;
+ buf_pool.flush_list_set_inactive();
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (n_flushed)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ n_flushed);
+
+ DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
+ return n_flushed;
+}
+
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n wished maximum mumber of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED,
+ lsn_t lsn= LSN_MAX)
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+ ulint n= buf_flush_list_holding_mutex(max_n, lsn);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ return n;
+}
+
+/** Try to flush all the dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
+{
+ const auto space_id= space->id;
+ ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+ bool may_have_skipped= false;
+ ulint max_n_flush= srv_io_capacity;
+ ulint n_flush= 0;
+
+ bool acquired= space->acquire_for_write();
+ {
+ const uint32_t written{space->flush_freed(acquired)};
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (written)
+ buf_pool.stat.n_pages_written+= written;
+ }
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_ad(bpage->oldest_modification());
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+ if (bpage->oldest_modification() == 1)
+ clear:
+ buf_pool.delete_from_flush_list(bpage);
+ else if (bpage->id().space() != space_id);
+ else if (!bpage->lock.u_lock_try(true))
+ may_have_skipped= true;
+ else if (bpage->oldest_modification() == 1)
+ {
+ bpage->lock.u_unlock(true);
+ goto clear;
+ }
+ else
+ {
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: Multiple executions of buf_flush_list_space() may be
+ interleaved, and also buf_do_flush_list_batch() may be running
+ concurrently. This may terminate our iteration prematurely,
+ leading us to return may_have_skipped=true. */
+ buf_pool.flush_hp.set(prev);
+
+ if (!acquired)
+ was_freed:
+ buf_flush_discard_page(bpage);
+ else
+ {
+ if (space->is_stopping_writes())
+ {
+ space->release();
+ acquired= false;
+ goto was_freed;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (bpage->flush(false, space))
+ {
+ ++n_flush;
+ if (!--max_n_flush)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ may_have_skipped= true;
+ goto done;
+ }
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (!buf_pool.flush_hp.is_hp(prev))
+ may_have_skipped= true;
+ bpage= buf_pool.flush_hp.get();
+ continue;
+ }
+
+ bpage= prev;
+ }
+
+ /* Note: this loop may have been executed concurrently with
+ buf_do_flush_list_batch() as well as other threads executing
+ buf_flush_list_space(). We should always return true from
+ buf_flush_list_space() if that should be the case; in
+ buf_do_flush_list_batch() we will simply perform less work. */
+done:
+ buf_pool.flush_hp.set(nullptr);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ buf_pool.stat.n_pages_written+= n_flush;
+
+ buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (n_flushed)
+ *n_flushed= n_flush;
+
+ if (acquired)
+ space->release();
+
+ if (space->purpose == FIL_TYPE_IMPORT)
+ os_aio_wait_until_no_pending_writes(true);
+ else
+ buf_dblwr.flush_buffered_writes();
+
+ return may_have_skipped;
+}
+
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n wished maximum mumber of blocks flushed
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written */
+ulint buf_flush_LRU(ulint max_n, bool evict)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ flush_counters_t n;
+ buf_do_LRU_batch(max_n, evict, &n);
+
+ ulint pages= n.flushed;
+
+ if (n.evicted)
+ {
+ if (evict)
+ pages+= n.evicted;
+ buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
+ }
+
+ return pages;
+}
+
+#ifdef HAVE_PMEM
+# include <libpmem.h>
+#endif
+
+/** Write checkpoint information to the log header and release mutex.
+@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
+inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(end_lsn >= next_checkpoint_lsn);
+ ut_ad(end_lsn <= get_lsn());
+ ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() ||
+ srv_shutdown_state > SRV_SHUTDOWN_INITIATED);
+
+ DBUG_PRINT("ib_log",
+ ("checkpoint at " LSN_PF " written", next_checkpoint_lsn));
+
+ auto n= next_checkpoint_no;
+ const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1};
+ static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency");
+ static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility");
+ byte* c= my_assume_aligned<CPU_LEVEL1_DCACHE_LINESIZE>
+ (is_pmem() ? buf + offset : checkpoint_buf);
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(c, 0, CPU_LEVEL1_DCACHE_LINESIZE);
+ mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn);
+ mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn);
+ mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60));
+
+ lsn_t resizing;
+
+#ifdef HAVE_PMEM
+ if (is_pmem())
+ {
+ resizing= resize_lsn.load(std::memory_order_relaxed);
+
+ if (resizing > 1 && resizing <= next_checkpoint_lsn)
+ {
+ memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64);
+ header_write(resize_buf, resizing, is_encrypted());
+ pmem_persist(resize_buf, resize_target);
+ }
+ pmem_persist(c, 64);
+ }
+ else
+#endif
+ {
+ ut_ad(!checkpoint_pending);
+ checkpoint_pending= true;
+ latch.wr_unlock();
+ log_write_and_flush_prepare();
+ resizing= resize_lsn.load(std::memory_order_relaxed);
+ /* FIXME: issue an asynchronous write */
+ log.write(offset, {c, get_block_size()});
+ if (resizing > 1 && resizing <= next_checkpoint_lsn)
+ {
+ byte *buf= static_cast<byte*>(aligned_malloc(4096, 4096));
+ memset_aligned<4096>(buf, 0, 4096);
+ header_write(buf, resizing, is_encrypted());
+ resize_log.write(0, {buf, 4096});
+ aligned_free(buf);
+ resize_log.write(CHECKPOINT_1, {c, get_block_size()});
+ }
+
+ if (srv_file_flush_method != SRV_O_DSYNC)
+ ut_a(log.flush());
+ latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(checkpoint_pending);
+ checkpoint_pending= false;
+ resizing= resize_lsn.load(std::memory_order_relaxed);
+ }
+
+ ut_ad(!checkpoint_pending);
+ next_checkpoint_no++;
+ const lsn_t checkpoint_lsn{next_checkpoint_lsn};
+ last_checkpoint_lsn= checkpoint_lsn;
+
+ DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF,
+ checkpoint_lsn, get_flushed_lsn()));
+ if (overwrite_warned)
+ {
+ sql_print_information("InnoDB: Crash recovery was broken "
+ "between LSN=" LSN_PF
+ " and checkpoint LSN=" LSN_PF ".",
+ overwrite_warned, checkpoint_lsn);
+ overwrite_warned= 0;
+ }
+
+ lsn_t resizing_completed= 0;
+
+ if (resizing > 1 && resizing <= checkpoint_lsn)
+ {
+ ut_ad(is_pmem() == !resize_flush_buf);
+
+ if (!is_pmem())
+ {
+ if (srv_file_flush_method != SRV_O_DSYNC)
+ ut_a(resize_log.flush());
+ IF_WIN(log.close(),);
+ }
+
+ if (resize_rename())
+ {
+ /* Resizing failed. Discard the log_sys.resize_log. */
+#ifdef HAVE_PMEM
+ if (is_pmem())
+ my_munmap(resize_buf, resize_target);
+ else
+#endif
+ {
+ ut_free_dodump(resize_buf, buf_size);
+ ut_free_dodump(resize_flush_buf, buf_size);
+#ifdef _WIN32
+ ut_ad(!log.is_opened());
+ bool success;
+ log.m_file=
+ os_file_create_func(get_log_file_path().c_str(),
+ OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
+ ut_a(success);
+ ut_a(log.is_opened());
+#endif
+ }
+ }
+ else
+ {
+ /* Adopt the resized log. */
+#ifdef HAVE_PMEM
+ if (is_pmem())
+ {
+ my_munmap(buf, file_size);
+ buf= resize_buf;
+ buf_free= START_OFFSET + (get_lsn() - resizing);
+ }
+ else
+#endif
+ {
+ IF_WIN(,log.close());
+ std::swap(log, resize_log);
+ ut_free_dodump(buf, buf_size);
+ ut_free_dodump(flush_buf, buf_size);
+ buf= resize_buf;
+ flush_buf= resize_flush_buf;
+ }
+ srv_log_file_size= resizing_completed= file_size= resize_target;
+ first_lsn= resizing;
+ set_capacity();
+ }
+ ut_ad(!resize_log.is_opened());
+ resize_buf= nullptr;
+ resize_flush_buf= nullptr;
+ resize_target= 0;
+ resize_lsn.store(0, std::memory_order_relaxed);
+ }
+
+ log_resize_release();
+
+ if (UNIV_LIKELY(resizing <= 1));
+ else if (resizing > checkpoint_lsn)
+ buf_flush_ahead(resizing, false);
+ else if (resizing_completed)
+ ib::info() << "Resized log to " << ib::bytes_iec{resizing_completed}
+ << "; start LSN=" << resizing;
+ else
+ buf_flush_ahead(end_lsn + 1, false);
+}
+
+/** Initiate a log checkpoint, discarding the start of the log.
+@param oldest_lsn the checkpoint LSN
+@param end_lsn log_sys.get_lsn()
+@return true if success, false if a checkpoint write was already running */
+static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
+{
+ ut_ad(!srv_read_only_mode);
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(log_sys.latch.is_write_locked());
+#endif
+ ut_ad(oldest_lsn <= end_lsn);
+ ut_ad(end_lsn == log_sys.get_lsn());
+
+ if (oldest_lsn == log_sys.last_checkpoint_lsn ||
+ (oldest_lsn == end_lsn &&
+ !log_sys.resize_in_progress() &&
+ oldest_lsn == log_sys.last_checkpoint_lsn +
+ (log_sys.is_encrypted()
+ ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT)))
+ {
+ /* Do nothing, because nothing was logged (other than a
+ FILE_CHECKPOINT record) since the previous checkpoint. */
+ do_nothing:
+ log_sys.latch.wr_unlock();
+ return true;
+ }
+
+ ut_ad(!recv_no_log_write);
+ ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn);
+ /* Repeat the FILE_MODIFY records after the checkpoint, in case some
+ log records between the checkpoint and log_sys.lsn need them.
+ Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
+ see a FILE_CHECKPOINT after the checkpoint, except on clean
+ shutdown, where the log will be empty after the checkpoint.
+
+ It is important that we write out the redo log before any further
+ dirty pages are flushed to the tablespace files. At this point,
+ because we hold exclusive log_sys.latch,
+ mtr_t::commit() in other threads will be blocked,
+ and no pages can be added to buf_pool.flush_list. */
+ const lsn_t flush_lsn{fil_names_clear(oldest_lsn)};
+ ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
+ log_sys.latch.wr_unlock();
+ log_write_up_to(flush_lsn, true);
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ if (log_sys.last_checkpoint_lsn >= oldest_lsn)
+ goto do_nothing;
+
+ ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
+
+ if (log_sys.checkpoint_pending)
+ {
+ /* A checkpoint write is running */
+ log_sys.latch.wr_unlock();
+ return false;
+ }
+
+ log_sys.next_checkpoint_lsn= oldest_lsn;
+ log_sys.write_checkpoint(end_lsn);
+
+ return true;
+}
+
+/** Make a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what is lsn of the oldest
+modification in the pool, and writes information about the lsn in
+log file. Use log_make_checkpoint() to flush also the pool.
+@retval true if the checkpoint was or had been made
+@retval false if a checkpoint write was already running */
+static bool log_checkpoint()
+{
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
+
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ const lsn_t end_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ return log_checkpoint_low(oldest_lsn, end_lsn);
+}
+
+/** Make a checkpoint. */
+ATTRIBUTE_COLD void log_make_checkpoint()
+{
+ buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
+ while (!log_checkpoint());
+}
+
+/** Wait for all dirty pages up to an LSN to be written out.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+static void buf_flush_wait(lsn_t lsn)
+{
+ ut_ad(lsn <= log_sys.get_lsn());
+
+ lsn_t oldest_lsn;
+
+ while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn)
+ {
+ if (buf_flush_sync_lsn < lsn)
+ {
+ buf_flush_sync_lsn= lsn;
+ buf_pool.page_cleaner_set_idle(false);
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ oldest_lsn= buf_pool.get_oldest_modification(lsn);
+ if (oldest_lsn >= lsn)
+ break;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ os_aio_wait_until_no_pending_writes(false);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+
+ if (oldest_lsn >= buf_flush_sync_lsn)
+ {
+ buf_flush_sync_lsn= 0;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+}
+
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
+{
+ ut_ad(sync_lsn);
+ ut_ad(sync_lsn < LSN_MAX);
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
+ {
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+
+#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
+ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
+ {
+ do
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
+ if (n_pages)
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_pages);
+ }
+ os_aio_wait_until_no_pending_writes(false);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+ while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+ }
+ else
+#endif
+ {
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ tpool::tpool_wait_begin();
+ buf_flush_wait(sync_lsn);
+ tpool::tpool_wait_end();
+ thd_wait_end(nullptr);
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
+ {
+ /* If the buffer pool was clean, no log write was guaranteed
+ to happen until now. There could be an outstanding FILE_CHECKPOINT
+ record from a previous fil_names_clear() call, which we must
+ write out before we can advance the checkpoint. */
+ log_write_up_to(sync_lsn, true);
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);
+ log_checkpoint();
+ }
+}
+
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);
+
+ Atomic_relaxed<lsn_t> &limit= furious
+ ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+ if (limit < lsn)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (limit < lsn)
+ {
+ limit= lsn;
+ buf_pool.page_cleaner_set_idle(false);
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
+}
+
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+ ut_ad(!srv_read_only_mode);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+ for (;;)
+ {
+ if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+ {
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+ }
+
+ switch (srv_file_flush_method) {
+ case SRV_NOSYNC:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ fil_flush_file_spaces();
+ }
+
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ const lsn_t newest_lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_t measure= buf_pool.get_oldest_modification(0);
+ const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+ if (!recv_recovery_is_on() &&
+ checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ log_checkpoint_low(checkpoint_lsn, newest_lsn);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ measure= buf_pool.get_oldest_modification(LSN_MAX);
+ }
+ else
+ {
+ log_sys.latch.wr_unlock();
+ if (!measure)
+ measure= LSN_MAX;
+ }
+
+ /* After attempting log checkpoint, check if we have reached our target. */
+ const lsn_t target= buf_flush_sync_lsn;
+
+ if (measure >= target)
+ buf_flush_sync_lsn= 0;
+ else if (measure >= buf_flush_async_lsn)
+ buf_flush_async_lsn= 0;
+
+ /* wake up buf_flush_wait() */
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ lsn= std::max(lsn, target);
+
+ if (measure >= lsn)
+ return;
+ }
+}
+
+/** Check if the adpative flushing threshold is recommended based on
+redo log capacity filled threshold.
+@param oldest_lsn buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+ lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+ lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+ static_cast<double>(log_sys.log_capacity) / 100);
+
+ /* if age > af_lwm adaptive flushing is recommended */
+ return (age > af_lwm);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+ lsn_t age) /*!< in: current age of LSN. */
+{
+ lsn_t af_lwm = static_cast<lsn_t>(
+ srv_adaptive_flushing_lwm
+ * static_cast<double>(log_sys.log_capacity) / 100);
+
+ if (age < af_lwm) {
+ /* No adaptive flushing. */
+ return(0);
+ }
+
+ lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
+
+ ut_ad(srv_max_io_capacity >= srv_io_capacity);
+ return static_cast<ulint>(
+ (static_cast<double>(srv_max_io_capacity / srv_io_capacity
+ * lsn_age_factor)
+ * sqrt(static_cast<double>(lsn_age_factor))
+ / 7.5));
+}
+
+/** This function is called approximately once every second by
+buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
+and innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
+@return number of pages recommended to be flushed
+@param last_pages_in number of pages flushed in previous batch
+@param oldest_lsn buf_pool.get_oldest_modification(0)
+@param pct_lwm innodb_max_dirty_pages_pct_lwm, or 0 to ignore it
+@param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+ lsn_t oldest_lsn,
+ double pct_lwm,
+ ulint dirty_blocks,
+ double dirty_pct)
+{
+ static lsn_t prev_lsn = 0;
+ static ulint sum_pages = 0;
+ static ulint avg_page_rate = 0;
+ static ulint n_iterations = 0;
+ static time_t prev_time;
+ lsn_t lsn_rate;
+ ulint n_pages = 0;
+
+ const lsn_t cur_lsn = log_sys.get_lsn();
+ ut_ad(oldest_lsn <= cur_lsn);
+ ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+ time_t curr_time = time(nullptr);
+ const double max_pct = srv_max_buf_pool_modified_pct;
+
+ if (!prev_lsn || !pct_for_lsn) {
+ prev_time = curr_time;
+ prev_lsn = cur_lsn;
+ if (max_pct > 0.0) {
+ dirty_pct /= max_pct;
+ }
+
+ n_pages = ulint(dirty_pct * double(srv_io_capacity));
+ if (n_pages < dirty_blocks) {
+ n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+ }
+
+func_exit:
+ page_cleaner.flush_pass++;
+ return n_pages;
+ }
+
+ sum_pages += last_pages_in;
+
+ const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
+
+ /* We update our variables every innodb_flushing_avg_loops
+ iterations to smooth out transition in workload. */
+ if (++n_iterations >= srv_flushing_avg_loops
+ || time_elapsed >= srv_flushing_avg_loops) {
+
+ avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;
+
+ /* How much LSN we have generated since last call. */
+ lsn_rate = (cur_lsn - prev_lsn) / time_elapsed;
+
+ lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+
+ if (page_cleaner.flush_pass) {
+ page_cleaner.flush_time /= page_cleaner.flush_pass;
+ }
+
+ prev_lsn = cur_lsn;
+ prev_time = curr_time;
+
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
+ page_cleaner.flush_time);
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
+ page_cleaner.flush_pass);
+
+ page_cleaner.flush_time = 0;
+ page_cleaner.flush_pass = 0;
+
+ n_iterations = 0;
+ sum_pages = 0;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+ double total_ratio;
+ if (pct_lwm == 0.0 || max_pct == 0.0) {
+ total_ratio = 1;
+ } else {
+ total_ratio = std::max(double(pct_for_lsn) / 100,
+ (dirty_pct / max_pct));
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, ulint(total_ratio * 100));
+
+ /* Estimate pages to be flushed for the lsn progress */
+ lsn_t target_lsn = oldest_lsn
+ + lsn_avg_rate * buf_flush_lsn_scan_factor;
+ ulint pages_for_lsn = 0;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
+ b != NULL;
+ b = UT_LIST_GET_PREV(list, b)) {
+ if (b->oldest_modification() > target_lsn) {
+ break;
+ }
+ if (++pages_for_lsn >= srv_max_io_capacity) {
+ break;
+ }
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ pages_for_lsn /= buf_flush_lsn_scan_factor;
+ if (pages_for_lsn < 1) {
+ pages_for_lsn = 1;
+ }
+
+ n_pages = (ulint(double(srv_io_capacity) * total_ratio)
+ + avg_page_rate + pages_for_lsn) / 3;
+
+ if (n_pages > srv_max_io_capacity) {
+ n_pages = srv_max_io_capacity;
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
+
+ MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+ MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+
+ goto func_exit;
+}
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn".
+We would only need this for buf_flush_page_cleaner(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+/** page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one coordinator. */
+static void buf_flush_page_cleaner()
+{
+ my_thread_init();
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_page_cleaner_is_active);
+
+ ulint last_pages= 0;
+ timespec abstime;
+ set_timespec(abstime, 1);
+
+ lsn_t lsn_limit;
+ ulint last_activity_count= srv_get_activity_count();
+
+ for (;;)
+ {
+ lsn_limit= buf_flush_sync_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync))
+ {
+ furious_flush:
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.ran_out())
+ goto no_wait;
+ else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
+
+ if (buf_pool.page_cleaner_idle() &&
+ (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+ srv_max_dirty_pages_pct_lwm == 0.0))
+ /* We are idle; wait for buf_pool.page_cleaner_wakeup() */
+ my_cond_wait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ else
+ my_cond_timedwait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex, &abstime);
+ no_wait:
+ set_timespec(abstime, 1);
+
+ lsn_limit= buf_flush_sync_lsn;
+ lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+ if (!oldest_lsn)
+ {
+ fully_unemployed:
+ buf_flush_sync_lsn= 0;
+ set_idle:
+ buf_pool.page_cleaner_set_idle(true);
+ set_almost_idle:
+ pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ buf_dblwr.flush_buffered_writes();
+
+ do
+ {
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
+
+ if (!recv_recovery_is_on() &&
+ !srv_startup_is_before_trx_rollback_phase &&
+ srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
+ log_checkpoint();
+ }
+ while (false);
+
+ if (!buf_pool.ran_out())
+ continue;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ oldest_lsn= buf_pool.get_oldest_modification(0);
+ }
+
+ lsn_t soft_lsn_limit= buf_flush_async_lsn;
+
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ if (srv_flush_sync)
+ goto do_furious_flush;
+ if (oldest_lsn >= lsn_limit)
+ {
+ buf_flush_sync_lsn= 0;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+ else if (lsn_limit > soft_lsn_limit)
+ soft_lsn_limit= lsn_limit;
+ }
+
+ double pct_lwm= 0.0;
+ ulint n_flushed= 0, n;
+
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
+ {
+ if (oldest_lsn >= soft_lsn_limit)
+ buf_flush_async_lsn= soft_lsn_limit= 0;
+ }
+ else if (buf_pool.ran_out())
+ {
+ buf_pool.page_cleaner_set_idle(false);
+ buf_pool.n_flush_inc();
+ /* Remove clean blocks from buf_pool.flush_list before the LRU scan. */
+ for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; )
+ {
+ const lsn_t lsn{p->oldest_modification()};
+ ut_ad(lsn > 2 || lsn == 1);
+ buf_page_t *n= UT_LIST_GET_NEXT(list, p);
+ if (lsn <= 1)
+ buf_pool.delete_from_flush_list(p);
+ p= n;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ n= srv_max_io_capacity;
+ mysql_mutex_lock(&buf_pool.mutex);
+ LRU_flush:
+ n= buf_flush_LRU(n, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ last_pages+= n;
+ check_oldest_and_set_idle:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.n_flush_dec_holding_mutex();
+ oldest_lsn= buf_pool.get_oldest_modification(0);
+ if (!oldest_lsn)
+ goto fully_unemployed;
+ if (oldest_lsn >= buf_flush_async_lsn)
+ buf_flush_async_lsn= 0;
+ buf_pool.page_cleaner_set_idle(false);
+ goto set_almost_idle;
+ }
+ else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
+
+ const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+ /* We perform dirty reads of the LRU+free list lengths here.
+ Division by zero is not possible, because buf_pool.flush_list is
+ guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+ const double dirty_pct= double(dirty_blocks) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ pct_lwm= srv_max_dirty_pages_pct_lwm;
+ if (pct_lwm != 0.0)
+ {
+ const ulint activity_count= srv_get_activity_count();
+ if (activity_count != last_activity_count)
+ {
+ last_activity_count= activity_count;
+ goto maybe_unemployed;
+ }
+ else if (buf_pool.page_cleaner_idle() && !os_aio_pending_reads())
+ {
+ /* reaching here means 3 things:
+ - last_activity_count == activity_count: suggesting server is idle
+ (no trx_t::commit() activity)
+ - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ - there are no pending reads but there are dirty pages to flush */
+ buf_pool.update_last_activity_count(activity_count);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ goto idle_flush;
+ }
+ else
+ {
+ maybe_unemployed:
+ const bool below{dirty_pct < pct_lwm};
+ pct_lwm= 0.0;
+ if (below)
+ goto possibly_unemployed;
+ }
+ }
+ else if (dirty_pct < srv_max_buf_pool_modified_pct)
+ possibly_unemployed:
+ if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn))
+ goto set_idle;
+
+ buf_pool.page_cleaner_set_idle(false);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
+ {
+ n= srv_max_io_capacity;
+ goto background_flush;
+ }
+
+ if (!srv_adaptive_flushing)
+ {
+ idle_flush:
+ n= srv_io_capacity;
+ soft_lsn_limit= LSN_MAX;
+ background_flush:
+ mysql_mutex_lock(&buf_pool.mutex);
+ n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit);
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+ }
+ else if ((n= page_cleaner_flush_pages_recommendation(last_pages,
+ oldest_lsn,
+ pct_lwm,
+ dirty_blocks,
+ dirty_pct)) != 0)
+ {
+ const ulint tm= ut_time_ms();
+ mysql_mutex_lock(&buf_pool.mutex);
+ last_pages= n_flushed= buf_flush_list_holding_mutex(n);
+ page_cleaner.flush_time+= ut_time_ms() - tm;
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_flushed);
+ }
+ else if (buf_flush_async_lsn <= oldest_lsn)
+ goto check_oldest_and_set_idle;
+
+ n= n >= n_flushed ? n - n_flushed : 0;
+ goto LRU_flush;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (srv_fast_shutdown != 2)
+ {
+ buf_dblwr.flush_buffered_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait_LRU_batch_end();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ os_aio_wait_until_no_pending_writes(false);
+ }
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ lsn_limit= buf_flush_sync_lsn;
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ do_furious_flush:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ goto furious_flush;
+ }
+ buf_page_cleaner_is_active= false;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ my_thread_end();
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+}
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
+{
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+ buf_flush_async_lsn= 0;
+ buf_flush_sync_lsn= 0;
+ buf_page_cleaner_is_active= true;
+ std::thread(buf_flush_page_cleaner).detach();
+}
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool()
+{
+ ut_ad(!os_aio_pending_reads());
+ ut_ad(!buf_page_cleaner_is_active);
+ ut_ad(!buf_flush_sync_lsn);
+
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush the buffer pool");
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ while (buf_pool.get_oldest_modification(0))
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ buf_flush_list(srv_max_io_capacity);
+ os_aio_wait_until_no_pending_writes(false);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush " ULINTPF " pages",
+ UT_LIST_GET_LEN(buf_pool.flush_list));
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ut_ad(!os_aio_pending_reads());
+}
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn)
+{
+ lsn= std::max(lsn, log_sys.get_lsn());
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait(lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync()
+{
+ if (recv_recovery_is_on())
+ {
+ mysql_mutex_lock(&recv_sys.mutex);
+ recv_sys.apply(true);
+ mysql_mutex_unlock(&recv_sys.mutex);
+ }
+
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ tpool::tpool_wait_begin();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ for (;;)
+ {
+ const lsn_t lsn= log_sys.get_lsn();
+ buf_flush_wait(lsn);
+ /* Wait for the page cleaner to be idle (for log resizing at startup) */
+ while (buf_flush_sync_lsn)
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ if (lsn == log_sys.get_lsn())
+ break;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ tpool::tpool_wait_end();
+ thd_wait_end(nullptr);
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the flush list. */
+struct Check {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_ad(elem->oldest_modification());
+ ut_ad(!fsp_is_system_temporary(elem->id().space()));
+ }
+};
+
+/** Validate the flush list. */
+static void buf_flush_validate_low()
+{
+ buf_page_t* bpage;
+
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+
+ ut_list_validate(buf_pool.flush_list, Check());
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
+
+ while (bpage != NULL) {
+ const lsn_t om = bpage->oldest_modification();
+ /* A page in buf_pool.flush_list can be in
+ BUF_BLOCK_REMOVE_HASH state. This happens when a page
+ is in the middle of being relocated. In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool.flush_list_mutex to complete the relocation. */
+ ut_d(const auto s= bpage->state());
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
+ ut_ad(om == 1 || om > 2);
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+ ut_ad(om == 1 || !bpage || recv_recovery_is_on()
+ || om >= bpage->oldest_modification());
+ }
+}
+
+/** Validate the flush list. */
+void buf_flush_validate()
+{
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_validate_low();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
new file mode 100644
index 00000000..65ee8fa3
--- /dev/null
+++ b/storage/innobase/buf/buf0lru.cc
@@ -0,0 +1,1452 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.cc
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+#include "my_cpu.h"
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if diagnostic message about difficult to find free blocks
+in the buffer bool has already printed. */
+static bool buf_lru_free_blocks_error_printed;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
+
+/** Co-efficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
+
+/** Sampled values buf_LRU_stat_cur.
+Not protected by any mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Not Protected by any mutex. */
+buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If !bpage->frame && bpage->oldest_modification() <= 1,
+the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param chain locked buf_pool.page_hash chain (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if bpage with bpage->frame was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if block without bpage->frame was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+ buf_pool_t::hash_chain &chain,
+ bool zip);
+
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+ block->page.free_file_page();
+ buf_LRU_block_free_non_file_page(block);
+}
+
+/** Increase LRU size in bytes by the page size.
+@param[in] bpage control block */
+static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ buf_pool.stat.LRU_bytes += bpage->physical_size();
+
+ ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size);
+}
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ /* If the unzip_LRU list is empty, we can only use the LRU. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) {
+ return false;
+ }
+
+ /* If unzip_LRU is at most 10% of the size of the LRU list,
+ then use the LRU. This slack allows us to keep hot
+ decompressed pages in the buffer pool. */
+ if (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+ <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+ return false;
+ }
+
+ /* If eviction hasn't started yet, we assume by default
+ that a workload is disk bound. */
+ if (buf_pool.freed_page_clock == 0) {
+ return true;
+ }
+
+ /* Calculate the average over past intervals, and add the values
+ of the current interval. */
+ ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.io;
+
+ ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.unzip;
+
+ /* Decide based on our formula. If the load is I/O bound
+ (unzip_avg is smaller than the weighted io_avg), evict an
+ uncompressed frame from unzip_LRU. Otherwise we assume that
+ the load is CPU bound and evict from the regular LRU. */
+ return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
+
+/** Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean.
+@param limit maximum number of blocks to scan
+@return true if freed */
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
+{
+ if (!buf_LRU_evict_from_unzip_LRU()) {
+ return(false);
+ }
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+ block && scanned < limit; ++scanned) {
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ freed = buf_LRU_free_page(&block->page, false);
+ if (freed) {
+ scanned++;
+ break;
+ }
+
+ block = prev_block;
+ }
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(freed);
+}
+
+/** Try to free a clean page from the common LRU list.
+@param limit maximum number of blocks to scan
+@return whether a page was freed */
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ ulint scanned = 0;
+ bool freed = false;
+
+ for (buf_page_t* bpage = buf_pool.lru_scan_itr.start();
+ bpage && scanned < limit;
+ ++scanned, bpage = buf_pool.lru_scan_itr.get()) {
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool.lru_scan_itr.set(prev);
+
+ const auto accessed = bpage->is_accessed();
+
+ if (buf_LRU_free_page(bpage, true)) {
+ if (!accessed) {
+ /* Keep track of pages that are evicted without
+ ever being accessed. This gives us a measure of
+ the effectiveness of readahead */
+ ++buf_pool.stat.n_ra_pages_evicted;
+ }
+
+ freed = true;
+ scanned++;
+ break;
+ }
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+
+ return(freed);
+}
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+
+ while (block != NULL) {
+ ut_ad(block->page.in_free_list);
+ ut_d(block->page.in_free_list = FALSE);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+ UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+ if (!buf_pool.is_shrinking()
+ || UT_LIST_GET_LEN(buf_pool.withdraw)
+ >= buf_pool.withdraw_target
+ || !buf_pool.will_be_withdrawn(block->page)) {
+ /* No adaptive hash index entries may point to
+ a free block. */
+ assert_block_ahi_empty(block);
+
+ block->page.set_state(buf_page_t::MEMORY);
+ block->page.set_os_used();
+ break;
+ }
+
+ /* This should be withdrawn */
+ UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page);
+ ut_d(block->in_withdraw_list = true);
+
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(buf_pool.free));
+ }
+
+ return(block);
+}
+
+/******************************************************************//**
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks)
+ return;
+
+ const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (s < buf_pool.curr_size / 20)
+ ib::fatal() << "Over 95 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many"
+ " row locks, or review if innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M could be bigger.";
+
+ if (s < buf_pool.curr_size / 3)
+ {
+ if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+ {
+ /* Over 67 % of the buffer pool is occupied by lock heaps or
+ the adaptive hash index. This may be a memory leak! */
+ ib::warn() << "Over 67 percent of the buffer pool is"
+ " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+ " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+ "! Check that your transactions do not set too many row locks."
+ " innodb_buffer_pool_size="
+ << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+ << "M. Starting the InnoDB Monitor to print diagnostics.";
+ buf_lru_switched_on_innodb_mon= true;
+ srv_print_innodb_monitor= TRUE;
+ srv_monitor_timer_schedule_now();
+ }
+ }
+ else if (buf_lru_switched_on_innodb_mon)
+ {
+ /* Switch off the InnoDB Monitor; this is a simple way to stop the
+ monitor if the situation becomes less urgent, but may also
+ surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+ buf_lru_switched_on_innodb_mon= false;
+ srv_print_innodb_monitor= FALSE;
+ }
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+ * get a block from the buf_pool.free list, success:done
+ * if buf_pool.try_LRU_scan is set
+ * scan LRU up to 100 pages to free a clean block
+ * success:retry the free list
+ * flush up to innodb_lru_flush_size LRU blocks to data files
+ (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth)
+ * on buf_page_write_complete() the blocks will put on buf_pool.free list
+ * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+ ulint n_iterations = 0;
+ ulint flush_failures = 0;
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+ if (have_mutex) {
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ goto got_mutex;
+ }
+ DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
+ if (recv_recovery_is_on()
+ && recv_sys.apply_log_recs) {
+ mysql_mutex_lock(&buf_pool.mutex);
+ goto flush_lru;
+ });
+get_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+ buf_LRU_check_size_of_non_data_objects();
+ buf_block_t* block;
+
+ DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+ if (!buf_lru_free_blocks_error_printed) {
+ n_iterations = 21;
+ goto not_found;});
+
+retry:
+ /* If there is a block in the free list, take it */
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+ if (!have_mutex) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+ block->page.zip.clear();
+ return block;
+ }
+
+ MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
+ if (n_iterations || buf_pool.try_LRU_scan) {
+ /* If no block was in the free list, search from the
+ end of the LRU list and try to free a block there.
+ If we are doing for the first time we'll scan only
+ tail of the LRU list otherwise we scan the whole LRU
+ list. */
+ if (buf_LRU_scan_and_free_block(n_iterations
+ ? ULINT_UNDEFINED : 100)) {
+ goto retry;
+ }
+
+ /* Tell other threads that there is no point
+ in scanning the LRU list. */
+ buf_pool.try_LRU_scan = false;
+ }
+
+ for (;;) {
+ if ((block = buf_LRU_get_free_only()) != nullptr) {
+ goto got_block;
+ }
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto n_flush = buf_pool.n_flush();
+ if (!buf_pool.try_LRU_scan) {
+ buf_pool.page_cleaner_wakeup(true);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (!n_flush) {
+ goto not_found;
+ }
+ if (!buf_pool.try_LRU_scan) {
+ my_cond_wait(&buf_pool.done_free,
+ &buf_pool.mutex.m_mutex);
+ }
+ }
+
+not_found:
+ if (n_iterations > 1) {
+ MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+ }
+
+ if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
+ && srv_buf_pool_old_size == srv_buf_pool_size) {
+ buf_lru_free_blocks_error_printed = true;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ ib::warn() << "Difficult to find free blocks in the buffer pool"
+ " (" << n_iterations << " search iterations)! "
+ << flush_failures << " failed attempts to"
+ " flush a page!"
+ " Consider increasing innodb_buffer_pool_size."
+ " Pending flushes (fsync): "
+ << fil_n_pending_tablespace_flushes
+ << ". " << os_n_file_reads << " OS file reads, "
+ << os_n_file_writes << " OS file writes, "
+ << os_n_fsyncs
+ << " OS fsyncs.";
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+
+ /* No free block was found: try to flush the LRU list.
+ The freed blocks will be up for grabs for all threads.
+
+ TODO: A more elegant way would have been to return one freed
+ up block to the caller here but the code that deals with
+ removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
+ involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
+ can do that in a separate patch sometime in future. */
+#ifndef DBUG_OFF
+flush_lru:
+#endif
+ if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
+ MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+ ++flush_failures;
+ }
+
+ n_iterations++;
+ buf_pool.stat.LRU_waits++;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ goto get_mutex;
+}
+
+/** Move the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits. */
+static void buf_LRU_old_adjust_len()
+{
+ ulint old_len;
+ ulint new_len;
+
+ ut_a(buf_pool.LRU_old);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+ ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+ compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
+ > BUF_LRU_OLD_RATIO_DIV
+ * (BUF_LRU_OLD_TOLERANCE + 5));
+ compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
+
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ old_len = buf_pool.LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+ * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool.LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ for (;;) {
+ buf_page_t* LRU_old = buf_pool.LRU_old;
+
+ ut_a(LRU_old);
+ ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Update the LRU_old pointer if necessary */
+
+ if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
+
+ buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV(
+ LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+ old_len = ++buf_pool.LRU_old_len;
+ LRU_old->set_old(true);
+
+ } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
+
+ buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+ old_len = --buf_pool.LRU_old_len;
+ LRU_old->set_old(false);
+ } else {
+ return;
+ }
+ }
+}
+
+/** Initialize the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
+static void buf_LRU_old_init()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
+
+ /* We first initialize all blocks in the LRU list as old and then use
+ the adjust function to move the LRU_old pointer to the right
+ position */
+
+ for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ut_ad(bpage->in_LRU_list);
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = true;
+ }
+
+ buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU);
+ buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ buf_LRU_old_adjust_len();
+}
+
+/** Remove a block from the unzip_LRU list if it belonged to the list.
+@param[in] bpage control block */
+static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
+{
+ ut_ad(bpage->in_file());
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = false);
+
+ UT_LIST_REMOVE(buf_pool.unzip_LRU, block);
+ }
+}
+
+/** Removes a block from the LRU list.
+@param[in] bpage control block */
+static inline void buf_LRU_remove_block(buf_page_t* bpage)
+{
+ /* Important that we adjust the hazard pointers before removing
+ bpage from the LRU list. */
+ buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage);
+
+ /* If the LRU_old pointer is defined and points to just this block,
+ move it backward one step */
+
+ if (bpage == buf_pool.LRU_old) {
+
+ /* Below: the previous block is guaranteed to exist,
+ because the LRU_old pointer is only allowed to differ
+ by BUF_LRU_OLD_TOLERANCE from strict
+ buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+ list length. */
+ ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+ buf_pool.LRU_old = prev_bpage;
+ prev_bpage->set_old(true);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ buf_pool.stat.LRU_bytes -= bpage->physical_size();
+
+ buf_unzip_LRU_remove_block_if_needed(bpage);
+
+ /* If the LRU list is so short that LRU_old is not defined,
+ clear the "old" flags and return */
+ if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ /* This loop temporarily violates the
+ assertions of buf_page_t::set_old(). */
+ bpage->old = false;
+ }
+
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+
+ return;
+ }
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Update the LRU_old_len field if necessary */
+ if (bpage->old) {
+ buf_pool.LRU_old_len--;
+ }
+
+ /* Adjust the length of the old block list if necessary */
+ buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(block->page.belongs_to_unzip_LRU());
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = true);
+
+ if (old) {
+ UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the page_size is
+already set when invoking the function, so that we can get correct
+page_size from the buffer page when adding a block into LRU */
+void
+buf_LRU_add_block(
+ buf_page_t* bpage, /*!< in: control block */
+ bool old) /*!< in: true if should be put to the old blocks
+ in the LRU list, else put to the start; if the
+ LRU list is very short, the block is added to
+ the start, regardless of this parameter */
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!bpage->in_LRU_list);
+
+ if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+ UT_LIST_ADD_FIRST(buf_pool.LRU, bpage);
+
+ bpage->freed_page_clock = buf_pool.freed_page_clock
+ & ((1U << 31) - 1);
+ } else {
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old,
+ bpage);
+
+ buf_pool.LRU_old_len++;
+ }
+
+ ut_d(bpage->in_LRU_list = TRUE);
+
+ incr_LRU_size_in_bytes(bpage);
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool.LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ bpage->set_old(old);
+ buf_LRU_old_adjust_len();
+
+ } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init();
+ } else {
+ bpage->set_old(buf_pool.LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (bpage->belongs_to_unzip_LRU()) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+ }
+}
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage)
+{
+ if (bpage->is_read_fixed())
+ return;
+
+ ut_ad(bpage->in_file());
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (UNIV_UNLIKELY(bpage->old))
+ buf_pool.stat.n_pages_made_young++;
+
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block(bpage, false);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+{
+ const page_id_t id{bpage->id()};
+ buf_page_t* b = nullptr;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ /* First, perform a quick check before we acquire hash_lock. */
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ /* We must hold an exclusive hash_lock to prevent
+ bpage->can_relocate() from changing due to a concurrent
+ execution of buf_page_get_low(). */
+ buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+ /* We cannot use transactional_lock_guard here,
+ because buf_buddy_relocate() in buf_buddy_free() could get stuck. */
+ hash_lock.lock();
+ const lsn_t oldest_modification = bpage->oldest_modification_acquire();
+
+ if (UNIV_UNLIKELY(!bpage->can_relocate())) {
+ /* Do not free buffer fixed and I/O-fixed blocks. */
+ goto func_exit;
+ }
+
+ switch (oldest_modification) {
+ case 2:
+ ut_ad(id.space() == SRV_TMP_SPACE_ID);
+ ut_ad(!bpage->zip.data);
+ if (!bpage->is_freed()) {
+ goto func_exit;
+ }
+ bpage->clear_oldest_modification();
+ break;
+ case 1:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (const lsn_t om = bpage->oldest_modification()) {
+ ut_ad(om == 1);
+ buf_pool.delete_from_flush_list(bpage);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ut_ad(!bpage->oldest_modification());
+ /* fall through */
+ case 0:
+ if (zip || !bpage->zip.data || !bpage->frame) {
+ break;
+ }
+relocate_compressed:
+ b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
+ ut_a(b);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ new (b) buf_page_t(*bpage);
+ b->frame = nullptr;
+ {
+ ut_d(uint32_t s=) b->fix();
+ ut_ad(s == buf_page_t::FREED
+ || s == buf_page_t::UNFIXED
+ || s == buf_page_t::IBUF_EXIST
+ || s == buf_page_t::REINIT);
+ }
+ break;
+ default:
+ if (zip || !bpage->zip.data || !bpage->frame) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+func_exit:
+ hash_lock.unlock();
+ return(false);
+ }
+ goto relocate_compressed;
+ }
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
+
+ ut_ad(bpage->can_relocate());
+
+ if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) {
+ ut_ad(!b);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ return(true);
+ }
+
+ /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
+ then it was a compressed page with an uncompressed frame and
+ we are interested in freeing only the uncompressed frame.
+ Therefore we have to reinsert the compressed page descriptor
+ into the LRU and page_hash (and possibly flush_list).
+ if !b then it was a regular page that has been freed */
+
+ if (UNIV_LIKELY_NULL(b)) {
+ buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+
+ ut_ad(!buf_pool.page_hash.get(id, chain));
+ ut_ad(b->zip_size());
+
+ /* The field in_LRU_list of
+ the to-be-freed block descriptor should have
+ been cleared in
+ buf_LRU_block_remove_hashed(), which
+ invokes buf_LRU_remove_block(). */
+ ut_ad(!bpage->in_LRU_list);
+ ut_ad(bpage->frame);
+ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+ /* The fields of bpage were copied to b before
+ buf_LRU_block_remove_hashed() was invoked. */
+ ut_ad(!b->in_zip_hash);
+ ut_ad(b->in_LRU_list);
+ ut_ad(b->in_page_hash);
+ ut_d(b->in_page_hash = false);
+ b->hash = nullptr;
+
+ buf_pool.page_hash.append(chain, b);
+
+ /* Insert b where bpage was in the LRU list. */
+ if (prev_b) {
+ ulint lru_len;
+
+ ut_ad(prev_b->in_LRU_list);
+ ut_ad(prev_b->in_file());
+
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
+
+ incr_LRU_size_in_bytes(b);
+
+ if (b->is_old()) {
+ buf_pool.LRU_old_len++;
+ if (buf_pool.LRU_old
+ == UT_LIST_GET_NEXT(LRU, b)) {
+
+ buf_pool.LRU_old = b;
+ }
+ }
+
+ lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+ ut_ad(buf_pool.LRU_old);
+ /* Adjust the length of the
+ old block list if necessary */
+ buf_LRU_old_adjust_len();
+ } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is now long
+ enough for LRU_old to become
+ defined: init it */
+ buf_LRU_old_init();
+ }
+#ifdef UNIV_LRU_DEBUG
+ /* Check that the "old" flag is consistent
+ in the block and its neighbours. */
+ b->set_old(b->is_old());
+#endif /* UNIV_LRU_DEBUG */
+ } else {
+ ut_d(b->in_LRU_list = FALSE);
+ buf_LRU_add_block(b, b->old);
+ }
+
+ buf_flush_relocate_on_flush_list(bpage, b);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ bpage->zip.data = nullptr;
+
+ page_zip_set_size(&bpage->zip, 0);
+
+ b->lock.x_lock();
+ hash_lock.unlock();
+ } else if (!zip) {
+ hash_lock.unlock();
+ }
+
+ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Remove the adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed(). We need to flag
+ the contents of the page valid (which it still is) in
+ order to avoid bogus Valgrind or MSAN warnings.*/
+
+ MEM_MAKE_DEFINED(block->page.frame, srv_page_size);
+ btr_search_drop_page_hash_index(block, false);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+#endif
+ if (UNIV_LIKELY_NULL(b)) {
+ ut_ad(b->zip_size());
+ b->lock.x_unlock();
+ b->unfix();
+ }
+
+ buf_LRU_block_free_hashed_page(block);
+
+ return(true);
+}
+
+/******************************************************************//**
+Puts a block back to the free list. */
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block) /*!< in: block, must not contain a file page */
+{
+ void* data;
+
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
+ assert_block_ahi_empty(block);
+ ut_ad(!block->page.in_free_list);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_ad(!block->page.hash);
+
+ block->page.set_state(buf_page_t::NOT_USED);
+
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
+ data = block->page.zip.data;
+
+ if (data != NULL) {
+ block->page.zip.data = NULL;
+ buf_pool_mutex_exit_forbid();
+
+ ut_ad(block->zip_size());
+
+ buf_buddy_free(data, block->zip_size());
+
+ buf_pool_mutex_exit_allow();
+ page_zip_set_size(&block->page.zip, 0);
+ }
+
+ if (buf_pool.is_shrinking()
+ && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
+ && buf_pool.will_be_withdrawn(block->page)) {
+ /* This should be withdrawn */
+ UT_LIST_ADD_LAST(
+ buf_pool.withdraw,
+ &block->page);
+ ut_d(block->in_withdraw_list = true);
+ } else {
+ UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+ ut_d(block->page.in_free_list = true);
+ buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
+ }
+
+ block->page.set_os_unused();
+}
+
+/** Release a memory block to the buffer pool. */
+ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
+{
+ ut_ad(this == &buf_pool);
+ mysql_mutex_lock(&mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&mutex);
+}
+
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If !bpage->frame && !bpage->oldest_modification(), the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param chain locked buf_pool.page_hash chain (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+ buf_pool_t::hash_chain &chain,
+ bool zip)
+{
+ ut_a(bpage->can_relocate());
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
+
+ buf_LRU_remove_block(bpage);
+
+ buf_pool.freed_page_clock += 1;
+
+ if (UNIV_LIKELY(!bpage->zip.data)) {
+ MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
+ MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size);
+ buf_block_modify_clock_inc((buf_block_t*) bpage);
+ } else if (const page_t *page = bpage->frame) {
+ MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
+ MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size);
+ buf_block_modify_clock_inc((buf_block_t*) bpage);
+
+ ut_a(!zip || !bpage->oldest_modification());
+ ut_ad(bpage->zip_size());
+ /* Skip consistency checks if the page was freed.
+ In recovery, we could get a sole FREE_PAGE record
+ and nothing else, for a ROW_FORMAT=COMPRESSED page.
+ Its contents would be garbage. */
+ if (!bpage->is_freed())
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ if (!zip) {
+ /* InnoDB writes the data to the
+ uncompressed page frame. Copy it
+ to the compressed page, which will
+ be preserved. */
+ memcpy(bpage->zip.data, page,
+ bpage->zip_size());
+ }
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ break;
+ default:
+ ib::error() << "The compressed page to be"
+ " evicted seems corrupt:";
+ ut_print_buf(stderr, page, srv_page_size);
+
+ ib::error() << "Possibly older version of"
+ " the page:";
+
+ ut_print_buf(stderr, bpage->zip.data,
+ bpage->zip_size());
+ putc('\n', stderr);
+ ut_error;
+ }
+ } else {
+ ut_a(!bpage->oldest_modification());
+ MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
+ }
+
+ ut_ad(!bpage->in_zip_hash);
+ buf_pool.page_hash.remove(chain, bpage);
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+
+ if (UNIV_UNLIKELY(!bpage->frame)) {
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_LRU_list);
+ ut_a(bpage->zip.data);
+ ut_a(bpage->zip.ssize);
+ ut_ad(!bpage->oldest_modification());
+
+ hash_lock.unlock();
+ buf_pool_mutex_exit_forbid();
+
+ buf_buddy_free(bpage->zip.data, bpage->zip_size());
+
+ buf_pool_mutex_exit_allow();
+ bpage->lock.free();
+ ut_free(bpage);
+ return false;
+ } else {
+ static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(bpage->frame + FIL_PAGE_OFFSET, 0xff, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(bpage->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ MEM_UNDEFINED(bpage->frame, srv_page_size);
+ bpage->set_state(buf_page_t::REMOVE_HASH);
+
+ if (!zip) {
+ return true;
+ }
+
+ hash_lock.unlock();
+
+ if (bpage->zip.data) {
+ /* Free the compressed page. */
+ void* data = bpage->zip.data;
+ bpage->zip.data = NULL;
+
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->oldest_modification());
+ ut_ad(!bpage->in_LRU_list);
+ buf_pool_mutex_exit_forbid();
+
+ buf_buddy_free(data, bpage->zip_size());
+
+ buf_pool_mutex_exit_allow();
+
+ page_zip_set_size(&bpage->zip, 0);
+ }
+
+ return true;
+ }
+}
+
+/** Release and evict a corrupted page.
+@param bpage x-latched page that was found corrupted
+@param state expected current state of the page */
+ATTRIBUTE_COLD
+void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state)
+{
+ const page_id_t id{bpage->id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+
+ recv_sys.free_corrupted_page(id);
+ mysql_mutex_lock(&mutex);
+ hash_lock.lock();
+
+ ut_ad(!bpage->oldest_modification());
+ bpage->set_corrupt_id();
+ auto unfix= state - buf_page_t::FREED;
+ auto s= bpage->zip.fix.fetch_sub(unfix) - unfix;
+ bpage->lock.x_unlock(true);
+
+ while (s != buf_page_t::FREED || bpage->lock.is_locked_or_waiting())
+ {
+ ut_ad(s >= buf_page_t::FREED);
+ ut_ad(s < buf_page_t::UNFIXED);
+ /* Wait for other threads to release the fix count
+ before releasing the bpage from LRU list. */
+ (void) LF_BACKOFF();
+ s= bpage->state();
+ }
+
+ /* remove from LRU and page_hash */
+ if (buf_LRU_block_remove_hashed(bpage, id, chain, true))
+ buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Update buf_pool.LRU_old_ratio.
+@param[in] old_pct Reserve this percentage of
+ the buffer pool for "old" blocks
+@param[in] adjust true=adjust the LRU list;
+ false=just assign buf_pool.LRU_old_ratio
+ during the initialization of InnoDB
+@return updated old_pct */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
+{
+ uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+ if (ratio < BUF_LRU_OLD_RATIO_MIN) {
+ ratio = BUF_LRU_OLD_RATIO_MIN;
+ } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
+ ratio = BUF_LRU_OLD_RATIO_MAX;
+ }
+
+ if (adjust) {
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (ratio != buf_pool.LRU_old_ratio) {
+ buf_pool.LRU_old_ratio = ratio;
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU)
+ >= BUF_LRU_OLD_MIN_LEN) {
+ buf_LRU_old_adjust_len();
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ } else {
+ buf_pool.LRU_old_ratio = ratio;
+ }
+ /* the reverse of
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
+ return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
+}
+
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+void
+buf_LRU_stat_update()
+{
+ buf_LRU_stat_t* item;
+ buf_LRU_stat_t cur_stat;
+
+ if (!buf_pool.freed_page_clock) {
+ goto func_exit;
+ }
+
+ /* Update the index. */
+ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+ buf_LRU_stat_arr_ind++;
+ buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+ /* Add the current value and subtract the obsolete entry.
+ Since buf_LRU_stat_cur is not protected by any mutex,
+ it can be changing between adding to buf_LRU_stat_sum
+ and copying to item. Assign it to local variables to make
+ sure the same value assign to the buf_LRU_stat_sum
+ and item */
+ cur_stat = buf_LRU_stat_cur;
+
+ buf_LRU_stat_sum.io += cur_stat.io - item->io;
+ buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip;
+
+ /* Put current entry in the array. */
+ memcpy(item, &cur_stat, sizeof *item);
+
+func_exit:
+ /* Clear the current entry. */
+ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn".
+We would only need this for buf_LRU_scan_and_free_block(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ return buf_LRU_free_from_unzip_LRU_list(limit) ||
+ buf_LRU_free_from_common_LRU_list(limit);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the LRU list. */
+void buf_LRU_validate()
+{
+ ulint old_len;
+ ulint new_len;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+ ut_a(buf_pool.LRU_old);
+ old_len = buf_pool.LRU_old_len;
+
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+ * buf_pool.LRU_old_ratio
+ / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool.LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+ ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+ }
+
+ CheckInLRUList::validate();
+
+ old_len = 0;
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ ut_ad(bpage->in_file());
+ ut_ad(!bpage->frame
+ || reinterpret_cast<buf_block_t*>(bpage)
+ ->in_unzip_LRU_list
+ == bpage->belongs_to_unzip_LRU());
+
+ if (bpage->is_old()) {
+ const buf_page_t* prev
+ = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next
+ = UT_LIST_GET_NEXT(LRU, bpage);
+
+ if (!old_len++) {
+ ut_a(buf_pool.LRU_old == bpage);
+ } else {
+ ut_a(!prev || prev->is_old());
+ }
+
+ ut_a(!next || next->is_old());
+ }
+ }
+
+ ut_a(buf_pool.LRU_old_len == old_len);
+
+ CheckInFreeList::validate();
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
+
+ ut_a(bpage->state() == buf_page_t::NOT_USED);
+ }
+
+ CheckUnzipLRUAndLRUList::validate();
+
+ for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU);
+ block != NULL;
+ block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+ ut_a(block->page.belongs_to_unzip_LRU());
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Dump the LRU list to stderr. */
+void buf_LRU_print()
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ const page_id_t id(bpage->id());
+
+ fprintf(stderr, "BLOCK space %u page %u ",
+ id.space(), id.page_no());
+
+ if (bpage->is_old()) {
+ fputs("old ", stderr);
+ }
+
+ const unsigned s = bpage->state();
+ if (s > buf_page_t::UNFIXED) {
+ fprintf(stderr, "fix %u ", s - buf_page_t::UNFIXED);
+ } else {
+ ut_ad(s == buf_page_t::UNFIXED
+ || s == buf_page_t::REMOVE_HASH);
+ }
+
+ if (bpage->oldest_modification()) {
+ fputs("modif. ", stderr);
+ }
+
+ if (const byte* frame = bpage->zip.data) {
+ fprintf(stderr, "\ntype %u size " ULINTPF
+ " index id " IB_ID_FMT "\n",
+ fil_page_get_type(frame),
+ bpage->zip_size(),
+ btr_page_get_index_id(frame));
+ } else {
+ fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
+ fil_page_get_type(bpage->frame),
+ btr_page_get_index_id(bpage->frame));
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
new file mode 100644
index 00000000..c4f07738
--- /dev/null
+++ b/storage/innobase/buf/buf0rea.cc
@@ -0,0 +1,710 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+
+#include "buf0rea.h"
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "log.h"
+#include "mariadb_stats.h"
+
+/** If there are buf_pool.curr_size per the number below pending reads, then
+read-ahead is not done: this is to prevent flooding the buffer pool with
+i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
+
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice
+that the block has been replaced with the real block.
+@param w sentinel
+@param chain locked hash table chain
+@return w->state() */
+inline uint32_t buf_pool_t::watch_remove(buf_page_t *w,
+ buf_pool_t::hash_chain &chain)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked());
+ ut_ad(w >= &watch[0]);
+ ut_ad(w < &watch[array_elements(watch)]);
+ ut_ad(!w->in_zip_hash);
+ ut_ad(!w->zip.data);
+
+ uint32_t s{w->state()};
+ w->set_state(buf_page_t::NOT_USED);
+ ut_ad(s >= buf_page_t::UNFIXED);
+ ut_ad(s < buf_page_t::READ_FIX);
+
+ if (~buf_page_t::LRU_MASK & s)
+ page_hash.remove(chain, w);
+
+ ut_ad(!w->in_page_hash);
+ w->id_= page_id_t(~0ULL);
+ return s;
+}
+
+/** Initialize a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip whether the uncompressed page is
+ requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval NULL in case of an error */
+TRANSACTIONAL_TARGET
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+ ulint zip_size, bool unzip)
+{
+ mtr_t mtr;
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY)
+ {
+ /* It is a read-ahead within an ibuf routine */
+ ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+ ibuf_mtr_start(&mtr);
+
+ if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+ {
+ ibuf_mtr_commit(&mtr);
+ return nullptr;
+ }
+ }
+ else
+ ut_ad(mode == BUF_READ_ANY_PAGE);
+
+ buf_page_t *bpage= nullptr;
+ buf_block_t *block= nullptr;
+ if (!zip_size || unzip || recv_recovery_is_on())
+ {
+ block= buf_LRU_get_free_block(false);
+ block->initialise(page_id, zip_size, buf_page_t::READ_FIX);
+ /* x_unlock() will be invoked
+ in buf_page_t::read_complete() by the io-handler thread. */
+ block->page.lock.x_lock(true);
+ }
+
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
+ if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+ {
+ /* The page is already in the buffer pool. */
+ if (block)
+ {
+ block->page.lock.x_unlock(true);
+ ut_d(block->page.set_state(buf_page_t::MEMORY));
+ buf_LRU_block_free_non_file_page(block);
+ }
+ goto func_exit;
+ }
+
+ if (UNIV_LIKELY(block != nullptr))
+ {
+ bpage= &block->page;
+
+ /* Insert into the hash table of file pages */
+ if (hash_page)
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+ (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+ buf_pool.page_hash.append(chain, &block->page);
+ }
+ else
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ buf_pool.page_hash.append(chain, &block->page);
+ }
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(&block->page, true/* to old blocks */);
+
+ if (UNIV_UNLIKELY(zip_size))
+ {
+ /* buf_pool.mutex may be released and reacquired by
+ buf_buddy_alloc(). We must defer this operation until after the
+ block descriptor has been added to buf_pool.LRU and
+ buf_pool.page_hash. */
+ block->page.zip.data= static_cast<page_zip_t*>
+ (buf_buddy_alloc(zip_size));
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+ we have to add this block to unzip_LRU
+ after block->page.zip.data is set. */
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(block, TRUE);
+ }
+ }
+ else
+ {
+ /* The compressed page must be allocated before the
+ control block (bpage), in order to avoid the
+ invocation of buf_buddy_relocate_block() on
+ uninitialized data. */
+ bool lru= false;
+ void *data= buf_buddy_alloc(zip_size, &lru);
+
+ /* If buf_buddy_alloc() allocated storage from the LRU list,
+ it released and reacquired buf_pool.mutex. Thus, we must
+ check the page_hash again, as it may have been modified. */
+ if (UNIV_UNLIKELY(lru))
+ {
+ hash_page= buf_pool.page_hash.get(page_id, chain);
+
+ if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+ {
+ /* The block was added by some other thread. */
+ buf_buddy_free(data, zip_size);
+ goto func_exit;
+ }
+ }
+
+ bpage= static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *bpage));
+
+ page_zip_des_init(&bpage->zip);
+ page_zip_set_size(&bpage->zip, zip_size);
+ bpage->zip.data = (page_zip_t*) data;
+
+ bpage->init(buf_page_t::READ_FIX, page_id);
+ bpage->lock.x_lock(true);
+
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+
+ if (hash_page)
+ bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+ (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+
+ buf_pool.page_hash.append(chain, bpage);
+ }
+
+ /* The block must be put to the LRU list, to the old blocks.
+ The zip size is already set into the page zip */
+ buf_LRU_add_block(bpage, true/* to old blocks */);
+ }
+
+ buf_pool.stat.n_pages_read++;
+func_exit:
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY)
+ ibuf_mtr_commit(&mtr);
+
+ ut_ad(!bpage || bpage->in_file());
+
+ return bpage;
+}
+
+/** Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+
+@param[in,out] space tablespace
+@param[in] sync true if synchronous aio is desired
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip true=request uncompressed page
+@return error code
+@retval DB_SUCCESS if the page was read
+@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
+static
+dberr_t
+buf_read_page_low(
+ fil_space_t* space,
+ bool sync,
+ ulint mode,
+ const page_id_t page_id,
+ ulint zip_size,
+ bool unzip)
+{
+ buf_page_t* bpage;
+
+ if (buf_dblwr.is_inside(page_id)) {
+ space->release();
+ return DB_PAGE_CORRUPTED;
+ }
+
+ if (sync) {
+ } else if (trx_sys_hdr_page(page_id)
+ || ibuf_bitmap_page(page_id, zip_size)
+ || (!recv_no_ibuf_operations
+ && ibuf_page(page_id, zip_size, nullptr))) {
+
+ /* Trx sys header is so low in the latching order that we play
+ safe and do not leave the i/o-completion to an asynchronous
+ i/o-thread. Change buffer pages must always be read with
+ synchronous i/o, to make sure they do not get involved in
+ thread deadlocks. */
+ sync = true;
+ }
+
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
+ bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+
+ if (!bpage) {
+ space->release();
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ ut_ad(bpage->in_file());
+ ulonglong mariadb_timer= 0;
+
+ if (sync) {
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ if (mariadb_stats_active())
+ mariadb_timer= mariadb_measure();
+ }
+
+ DBUG_LOG("ib_buf",
+ "read page " << page_id << " zip_size=" << zip_size
+ << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
+ void* dst = zip_size ? bpage->zip.data : bpage->frame;
+ const ulint len = zip_size ? zip_size : srv_page_size;
+
+ auto fio = space->io(IORequest(sync
+ ? IORequest::READ_SYNC
+ : IORequest::READ_ASYNC),
+ os_offset_t{page_id.page_no()} * len, len,
+ dst, bpage);
+
+ if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
+ buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
+ } else if (sync) {
+ thd_wait_end(NULL);
+ /* The i/o was already completed in space->io() */
+ fio.err = bpage->read_complete(*fio.node);
+ space->release();
+ if (fio.err == DB_FAIL) {
+ fio.err = DB_PAGE_CORRUPTED;
+ }
+ if (mariadb_timer)
+ mariadb_increment_pages_read_time(mariadb_timer);
+ }
+
+ return fio.err;
+}
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in] page_id page id of a page which the current thread
+wants to access
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+TRANSACTIONAL_TARGET
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+ if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
+ /* Disable the read-ahead for temporary tablespace */
+ return 0;
+
+ if (srv_startup_is_before_trx_rollback_phase)
+ /* No read-ahead to avoid thread deadlocks */
+ return 0;
+
+ if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+ /* If it is an ibuf bitmap page or trx sys hdr, we do no
+ read-ahead, as that could break the ibuf page access order */
+ return 0;
+
+ if (os_aio_pending_reads_approx() >
+ buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ return 0;
+
+ fil_space_t* space= fil_space_t::get(page_id.space());
+ if (!space)
+ return 0;
+
+ const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+ ulint count= 5 + buf_read_ahead_area / 8;
+ const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+ page_id_t high= low + buf_read_ahead_area;
+ high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+ /* Count how many blocks in the area have been recently accessed,
+ that is, reside near the start of the LRU list. */
+
+ for (page_id_t i= low; i < high; ++i)
+ {
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
+ if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count)
+ goto read_ahead;
+ }
+
+no_read_ahead:
+ space->release();
+ return 0;
+
+read_ahead:
+ if (space->is_stopping())
+ goto no_read_ahead;
+
+ /* Read all the suitable blocks within the area */
+ const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+
+ for (page_id_t i= low; i < high; ++i)
+ {
+ if (ibuf_bitmap_page(i, zip_size))
+ continue;
+ if (space->is_stopping())
+ break;
+ space->reacquire();
+ if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) ==
+ DB_SUCCESS)
+ count++;
+ }
+
+ if (count)
+ {
+ DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+ count, space->chain.start->name,
+ low.page_no()));
+ mysql_mutex_lock(&buf_pool.mutex);
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+ buf_pool.stat.n_ra_pages_read_rnd+= count;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+
+ space->release();
+ return count;
+}
+
+/** High-level function which reads a page from a file to buf_pool
+if it is not already there. Sets the io_fix and an exclusive lock
+on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
+{
+ fil_space_t *space= fil_space_t::get(page_id.space());
+ if (!space)
+ {
+ ib::info() << "trying to read page " << page_id
+ << " in nonexisting or being-dropped tablespace";
+ return DB_TABLESPACE_DELETED;
+ }
+
+ buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */
+ return buf_read_page_low(space, true, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false);
+}
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out] space tablespace
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+ ulint zip_size)
+{
+ buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false);
+
+ /* We do not increment number of I/O operations used for LRU policy
+ here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+ about evicting uncompressed version of compressed pages from the
+ buffer pool. Since this function is called from buffer pool load
+ these IOs are deliberate and are not part of normal workload we can
+ ignore these in our heuristics. */
+}
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in] page_id page id; see NOTE 3 above
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] ibuf whether if we are inside ibuf routine
+@return number of page read requests issued */
+TRANSACTIONAL_TARGET
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+ /* check if readahead is disabled.
+ Disable the read ahead logic for temporary tablespace */
+ if (!srv_read_ahead_threshold || page_id.space() >= SRV_TMP_SPACE_ID)
+ return 0;
+
+ if (srv_startup_is_before_trx_rollback_phase)
+ /* No read-ahead to avoid thread deadlocks */
+ return 0;
+
+ if (os_aio_pending_reads_approx() >
+ buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ return 0;
+
+ const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+ const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+ const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+ /* We will check that almost all pages in the area have been accessed
+ in the desired order. */
+ const bool descending= page_id != low;
+
+ if (!descending && page_id != high_1)
+ /* This is not a border page of the area */
+ return 0;
+
+ if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+ /* If it is an ibuf bitmap page or trx sys hdr, we do no
+ read-ahead, as that could break the ibuf page access order */
+ return 0;
+
+ fil_space_t *space= fil_space_t::get(page_id.space());
+ if (!space)
+ return 0;
+
+ if (high_1.page_no() > space->last_page_number())
+ {
+ /* The area is not whole. */
+fail:
+ space->release();
+ return 0;
+ }
+
+ /* How many out of order accessed pages can we ignore
+ when working out the access pattern for linear readahead */
+ ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+ srv_read_ahead_threshold,
+ uint32_t{buf_pool.read_ahead_area});
+ page_id_t new_low= low, new_high_1= high_1;
+ unsigned prev_accessed= 0;
+ for (page_id_t i= low; i <= high_1; ++i)
+ {
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard here,
+ because we would have many complex conditions inside the memory
+ transaction. */
+ hash_lock.lock_shared();
+
+ const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
+ if (!bpage)
+ {
+ hash_lock.unlock_shared();
+ if (i == page_id)
+ goto fail;
+failed:
+ if (--count)
+ continue;
+ goto fail;
+ }
+ const unsigned accessed= bpage->is_accessed();
+ if (i == page_id)
+ {
+ /* Read the natural predecessor and successor page addresses from
+ the page; NOTE that because the calling thread may have an x-latch
+ on the page, we do not acquire an s-latch on the page, this is to
+ prevent deadlocks. The hash_lock is only protecting the
+ buf_pool.page_hash for page i, not the bpage contents itself. */
+ const byte *f= bpage->frame ? bpage->frame : bpage->zip.data;
+ uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+ uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+ hash_lock.unlock_shared();
+ if (prev == FIL_NULL || next == FIL_NULL)
+ goto fail;
+ page_id_t id= page_id;
+ if (descending)
+ {
+ if (id == high_1)
+ ++id;
+ else if (next - 1 != page_id.page_no())
+ goto fail;
+ else
+ id.set_page_no(prev);
+ }
+ else
+ {
+ if (prev + 1 != page_id.page_no())
+ goto fail;
+ id.set_page_no(next);
+ }
+
+ new_low= id - (id.page_no() % buf_read_ahead_area);
+ new_high_1= new_low + (buf_read_ahead_area - 1);
+
+ if (id != new_low && id != new_high_1)
+ /* This is not a border page of the area: return */
+ goto fail;
+ if (new_high_1.page_no() > space->last_page_number())
+ /* The area is not whole */
+ goto fail;
+ }
+ else
+ hash_lock.unlock_shared();
+
+ if (!accessed)
+ goto failed;
+ /* Note that buf_page_t::is_accessed() returns the time of the
+ first access. If some blocks of the extent existed in the buffer
+ pool at the time of a linear access pattern, the first access
+ times may be nonmonotonic, even though the latest access times
+ were linear. The threshold (srv_read_ahead_factor) should help a
+ little against this. */
+ bool fail= prev_accessed &&
+ (descending ? prev_accessed > accessed : prev_accessed < accessed);
+ prev_accessed= accessed;
+ if (fail)
+ goto failed;
+ }
+
+ /* If we got this far, read-ahead can be sensible: do it */
+ count= 0;
+ for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+ new_low <= new_high_1; ++new_low)
+ {
+ if (ibuf_bitmap_page(new_low, zip_size))
+ continue;
+ if (space->is_stopping())
+ break;
+ space->reacquire();
+ if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) ==
+ DB_SUCCESS)
+ count++;
+ }
+
+ if (count)
+ {
+ DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+ count, space->chain.start->name,
+ new_low.page_no()));
+ mysql_mutex_lock(&buf_pool.mutex);
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+ buf_pool.stat.n_ra_pages_read+= count;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
+
+ space->release();
+ return count;
+}
+
+/** Schedule a page for recovery.
+@param space tablespace
+@param page_id page identifier
+@param recs log records
+@param init page initialization, or nullptr if the page needs to be read */
+void buf_read_recover(fil_space_t *space, const page_id_t page_id,
+ page_recv_t &recs, recv_init *init)
+{
+ ut_ad(space->id == page_id.space());
+ space->reacquire();
+ const ulint zip_size= space->zip_size();
+
+ if (init)
+ {
+ if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id,
+ zip_size, true))
+ {
+ ut_ad(bpage->in_file());
+ os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs,
+ UT_LIST_GET_FIRST(space->chain),
+ IORequest::READ_ASYNC}, ptrdiff_t(init));
+ }
+ }
+ else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
+ page_id, zip_size, true))
+ {
+ if (err != DB_SUCCESS_LOCKED_REC)
+ sql_print_error("InnoDB: Recovery failed to read page "
+ UINT32PF " from %s",
+ page_id.page_no(), space->chain.start->name);
+ }
+}